Bladeren bron

update UI

pull/129/head
naibo 1 jaar geleden
bovenliggende
commit
1e2ca08077
21 gewijzigde bestanden met toevoegingen van 370 en 1929 verwijderingen
  1. +84
    -30
      .temp_to_pub/EasySpider_windows_x64/Code/easyspider_executestage.py
  2. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/tasks/124.json
  3. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/tasks/125.json
  4. BIN
      ElectronJS/EasySpider_en.crx
  5. BIN
      ElectronJS/EasySpider_zh.crx
  6. +1
    -1
      ElectronJS/config.json
  7. +3
    -3
      ElectronJS/main.js
  8. +1
    -0
      ElectronJS/server.js
  9. +203
    -0
      ElectronJS/src/delaration.html
  10. +8
    -0
      ElectronJS/src/index.html
  11. +5
    -0
      ElectronJS/src/index.js
  12. +3
    -3
      ElectronJS/src/taskGrid/FlowChart_CN.html
  13. +1
    -0
      ElectronJS/src/taskGrid/newTask.html
  14. +1
    -1869
      ElectronJS/tasks/162.json
  15. +13
    -10
      ElectronJS/update_chrome.py
  16. +1
    -1
      ExecuteStage/.vscode/launch.json
  17. +13
    -5
      ExecuteStage/easyspider_executestage.py
  18. +5
    -1
      ExecuteStage/myChrome.py
  19. +8
    -3
      ExecuteStage/undetected_chromedriver_ES/__init__.py
  20. +14
    -3
      ExecuteStage/undetected_chromedriver_ES/patcher.py
  21. +4
    -0
      Readme.md

+ 84
- 30
.temp_to_pub/EasySpider_windows_x64/Code/easyspider_executestage.py Bestand weergeven

@ -266,18 +266,40 @@ class BrowserThread(Thread):
scrollType = int(para["scrollType"])
try:
if scrollType != 0 and para["scrollCount"] > 0: # 控制屏幕向下滚动
for i in range(para["scrollCount"]):
self.Log("Wait for set second after screen scrolling")
body = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=para["iframe"])
if scrollType == 1:
body.send_keys(Keys.PAGE_DOWN)
elif scrollType == 2:
if scrollType == 1 or scrollType == 2:
for i in range(para["scrollCount"]):
self.Log("Wait for set second after screen scrolling")
body = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=para["iframe"])
if scrollType == 1:
body.send_keys(Keys.PAGE_DOWN)
elif scrollType == 2:
body.send_keys(Keys.END)
try:
time.sleep(para["scrollWaitTime"]) # 下拉完等待
except:
pass
elif scrollType == 3:
bodyText = ""
i = 0
while True:
newBodyText = self.browser.page_source
if newBodyText == bodyText:
print("页面已检测不到新内容,停止滚动。")
print("No new content detected on the page, stop scrolling.")
break
else:
bodyText = newBodyText
body = self.browser.find_element(
By.CSS_SELECTOR, "body", iframe=para["iframe"])
body.send_keys(Keys.END)
try:
time.sleep(para["scrollWaitTime"]) # 下拉完等待
except:
pass
print("滚动到底部,第", i + 1, "次。")
print("Scroll to the bottom, the", i + 1, "time.")
i = i + 1
try:
time.sleep(para["scrollWaitTime"]) # 下拉完等待
except:
pass
except:
self.Log('Time out after set seconds when scrolling. ')
self.recordLog('Time out after set seconds when scrolling')
@ -589,9 +611,18 @@ class BrowserThread(Thread):
if int(node["parameters"]["loopType"]) == 0: # 单个元素循环
# 无跳转标签页操作
count = 0 # 执行次数
bodyText = "-"
while True: # do while循环
try:
finished = False
newBodyText = self.browser.page_source
if newBodyText == bodyText: # 如果页面内容无变化
print("页面已检测不到新内容,停止循环。")
print("No new content detected on the page, stop loop.")
finished = True
break
else:
bodyText = newBodyText
element = self.browser.find_element(
By.XPATH, node["parameters"]["xpath"], iframe=node["parameters"]["iframe"])
for i in node["sequence"]: # 挨个执行操作
@ -1190,29 +1221,42 @@ class BrowserThread(Thread):
# p["relativeXPath"] = p["relativeXPath"].lower()
# p["relativeXPath"] = lowercase_tags_in_xpath(p["relativeXPath"])
# 已经有text()或@href了,不需要再加
content_type = ""
if p["relativeXPath"].find("/@href") >= 0 or p["relativeXPath"].find("/text()") >= 0 or p["relativeXPath"].find("::text()") >= 0:
xpath = p["relativeXPath"]
content_type = ""
elif p["nodeType"] == 2:
xpath = p["relativeXPath"] + "/@href"
content_type = "/@href"
elif p["contentType"] == 1:
xpath = p["relativeXPath"] + "/text()"
content_type = "/text()"
elif p["contentType"] == 0:
xpath = p["relativeXPath"] + "//text()"
content_type = "//text()"
xpath = p["relativeXPath"] + content_type
if p["relative"]:
# if p["relativeXPath"] == "":
# content = [loopElementHTML]
# else:
# 如果字串里有//即子孙查找,则不动语句
if p["relativeXPath"].find("//") >= 0:
full_path = "(" + parentPath + \
xpath + ")" + \
"[" + str(index + 1) + "]"
content = pageHTML.xpath(full_path)
if xpath.startswith("/"):
full_path = "(" + parentPath + ")" + \
"[" + str(index + 1) + "]"+ \
p["relativeXPath"] + content_type
else: # 如果是id()这种形式,不需要包parentPath
full_path = xpath
try:
content = pageHTML.xpath(full_path)
except:
content = []
elif not p["relativeXPath"].startswith("/"): # 如果是id()这种形式,不需要包/html/body
try:
content = loopElementHTML.xpath(xpath)
except:
content = []
else:
content = loopElementHTML.xpath(
"/html/body/" + loopElementHTML[0][0].tag + xpath)
else:
if xpath.find("/body") < 0:
if xpath.find("/body") < 0 and xpath.startswith("/"): # 如果是id()或(//div)[1]这种形式,不需要包/html/body
xpath = "/html/body" + xpath
content = pageHTML.xpath(xpath)
if len(content) > 0:
@ -1258,9 +1302,12 @@ class BrowserThread(Thread):
else:
# 如果字串里有//即子孙查找,则不动语句
if p["relativeXPath"].find("//") >= 0:
full_path = "(" + parentPath + \
p["relativeXPath"] + ")" + \
"[" + str(index + 1) + "]"
# full_path = "(" + parentPath + \
# p["relativeXPath"] + ")" + \
# "[" + str(index + 1) + "]"
full_path = "(" + parentPath + ")" + \
"[" + str(index + 1) + "]" + \
p["relativeXPath"]
element = self.browser.find_element(
By.XPATH, full_path, iframe=p["iframe"])
else:
@ -1390,6 +1437,8 @@ if __name__ == '__main__':
if sys.platform == "win32" and platform.architecture()[0] == "32bit":
options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
option.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
@ -1397,12 +1446,15 @@ if __name__ == '__main__':
elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
option.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
@ -1422,6 +1474,7 @@ if __name__ == '__main__':
print("Finding chromedriver in EasySpider",
os.getcwd()+"/ElectronJS")
option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
option.add_extension("../ElectronJS/XPathHelper.crx")
else:
@ -1431,10 +1484,7 @@ if __name__ == '__main__':
option.add_experimental_option(
'excludeSwitches', ['enable-automation']) # 以开发者模式
options.add_argument('-ignore-certificate-errors')
options.add_argument('-ignore -ssl-errors')
option.add_argument('-ignore-certificate-errors')
option.add_argument('-ignore -ssl-errors')
# user_data_dir = r'' # 注意没有Default!
# options.add_argument('--user-data-dir='+p)
@ -1496,6 +1546,8 @@ if __name__ == '__main__':
except:
cloudflare = 0
if cloudflare == 0:
options.add_argument('log-level=3') # 隐藏日志
option.add_argument('log-level=3') # 隐藏日志
options.add_experimental_option("prefs", {
# 设置文件下载路径
"download.default_directory": "Data/Task_" + str(i),
@ -1526,10 +1578,9 @@ if __name__ == '__main__':
options=options, chrome_options=option, executable_path=driver_path)
elif cloudflare == 1:
if sys.platform != "darwin":
options.binary_location = "" # 需要用自己的浏览器
browser_t = MyUCChrome(
options=options, chrome_options=option, driver_executable_path=driver_path)
print("Pass Cloudflare Mode")
print("过Cloudflare验证模式")
options=options, driver_executable_path=driver_path)
else:
print("Not support Cloudflare Mode on MacOS")
print("MacOS不支持Cloudflare验证模式")
@ -1556,6 +1607,9 @@ if __name__ == '__main__':
print("正在运行任务,长按键盘p键可暂停任务的执行以便手工操作浏览器如输入验证码;如果想恢复任务的执行,请再次长按p键。")
print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
print("----------------------------------\n\n")
if cloudflare:
print("过Cloudflare验证模式有时候会不稳定,请注意观察上方提示的浏览器版本信息是否正确,如果无法通过验证则需要隔几分钟重试一次,或者可以更换新的用户信息文件夹再执行任务。")
print("Passing the Cloudflare verification mode is sometimes unstable. Please pay attention to whether the browser version information prompted above is correct. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
# 使用监听器监听键盘输入
try:
with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener:

+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/tasks/124.json
Diff onderdrukt omdat het bestand te groot is
Bestand weergeven


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/tasks/125.json Bestand weergeven

@ -0,0 +1 @@
{"id":125,"name":"Just a moment...","url":"https://portal.ustraveldocs.com/scheduleappointment","links":"https://portal.ustraveldocs.com/scheduleappointment","create_time":"7/12/2023, 11:21:54 AM","update_time":"7/12/2023, 11:23:01 AM","version":"0.3.5","saveThreshold":10,"cloudflare":1,"environment":0,"maxViewLength":15,"outputFormat":"xlsx","saveName":"current_time","containJudge":false,"desc":"https://portal.ustraveldocs.com/scheduleappointment","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://portal.ustraveldocs.com/scheduleappointment","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://portal.ustraveldocs.com/scheduleappointment"}],"outputParameters":[{"id":0,"name":"参数2_文本","desc":"","type":"text","recordASField":1,"exampleValue":"\n 使用条款及细则 (Terms & Conditions)\n \n 在本网站所支付的所有费用均不予退还。请确保您已付款,并获得了收据号码。\n 签证不能保证进入美国。\n 签证允许外国公民进入美国口岸并提出入境申请。\n 只有美国国土安全部和美国海关与边境保护局(CBP)官员可以决定签证持有人能否入境。\n 您不能使用过期签证进入美国。当您进入美国时签证必须是有效的。\n \n "}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0,"waitType":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":15,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://portal.ustraveldocs.com/scheduleappointment","links":"https://portal.ustraveldocs.com/scheduleappointment","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":3,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":0,"relative":false,"name":"参数2_文本
","desc":"","extractType":0,"relativeXPath":"/html/body/div[1]/div[1]/div[1]/form[1]/span[1]/div[1]","allXPaths":["/html/body/div[1]/div[1]/div[1]/form[1]/span[1]/div[1]","//div[contains(., '使')]","//DIV[@class='span-6 last']","/html/body/div[last()-5]/div/div/form/span/div"],"exampleValues":[{"num":0,"value":"\n 使用条款及细则 (Terms & Conditions)\n \n 在本网站所支付的所有费用均不予退还。请确保您已付款,并获得了收据号码。\n 签证不能保证进入美国。\n 签证允许外国公民进入美国口岸并提出入境申请。\n 只有美国国土安全部和美国海关与边境保护局(CBP)官员可以决定签证持有人能否入境。\n 您不能使用过期签证进入美国。当您进入美国时签证必须是有效的。\n \n "}],"unique_index":"p3h5p8qfeyljz5n60b","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

BIN
ElectronJS/EasySpider_en.crx Bestand weergeven


BIN
ElectronJS/EasySpider_zh.crx Bestand weergeven


+ 1
- 1
ElectronJS/config.json Bestand weergeven

@ -1 +1 @@
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Document\\Projects\\EasySpider\\ElectronJS\\user_data"}
{"webserver_address":"http://localhost","webserver_port":8074,"user_data_folder":"./user_data","debug":false,"copyright":0,"mysql_config_path":"./mysql_config.json","absolute_user_data_folder":"D:\\Document\\Projects\\EasySpider\\ElectronJS\\user_data"}

+ 3
- 3
ElectronJS/main.js Bestand weergeven

@ -112,12 +112,12 @@ function createWindow() {
},
icon: iconPath,
// frame: false, //取消window自带的关闭最小化等
resizable: false //禁止改变主窗口尺寸
// resizable: false //禁止改变主窗口尺寸
})
// and load the index.html of the app.
// mainWindow.loadFile('src/index.html');
mainWindow.loadURL(server_address + '/index.html?user_data_folder=' + config.user_data_folder, { extraHeaders: 'pragma: no-cache\n' });
mainWindow.loadURL(server_address + '/index.html?user_data_folder=' + config.user_data_folder+"&copyright=" + config.copyright, { extraHeaders: 'pragma: no-cache\n' });
// 隐藏菜单栏
const {Menu} = require('electron');
Menu.setApplicationMenu(null);
@ -126,7 +126,7 @@ function createWindow() {
app.quit();
}
});
// mainWindow.webContents.openDevTools();
mainWindow.webContents.openDevTools();
// Open the DevTools.
// mainWindow.webContents.openDevTools()
}

+ 1
- 0
ElectronJS/server.js Bestand weergeven

@ -64,6 +64,7 @@ if(!fs.existsSync(path.join(getDir(), "config.json"))){
"webserver_port": 8074,
"user_data_folder": "./user_data",
"debug": false,
"copyright": 0,
"mysql_config_path": "./mysql_config.json",
"absolute_user_data_folder": "D:\\Document\\Projects\\EasySpider\\ElectronJS\\user_data"
}

+ 203
- 0
ElectronJS/src/delaration.html Bestand weergeven

@ -0,0 +1,203 @@
<!DOCTYPE html>
<html>
<head>
<script src="js/jquery-3.4.1.min.js"></script>
<meta charset="UTF-8">
<meta content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0"
name="viewport">
<meta content="ie=edge" http-equiv="X-UA-Compatible">
<script src="js/vue.global.js"></script>
<!-- <script src="https://unpkg.com/vue-i18n@9"></script> -->
<link href="bootstrap/css/bootstrap.css" rel="stylesheet">
<title>EasySpider: NoCode Visual Web Crawler</title>
</head>
<style>
.img-container {
display: flex;
flex-direction: column;
align-items: center;
justify-content: center;
margin-top: 40px;
}
.img-container img {
/*max-width: 50%;*/
height: 75px;
margin-top: 10px;
margin-bottom: 10px; /* 可根据需要调整图片之间的间距 */
}
</style>
<body>
<div id="app">
<div style="padding: 10px; text-align: center;vertical-align: middle;" v-if="init">
<h5 style="margin-top: 20px">选择语言/Select Language</h5>
<p><a @click="changeLang('zh')" class="btn btn-outline-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;">中文</a></p>
<p><a @click="changeLang('en')" class="btn btn-outline-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;">English</a></p>
<p style="font-size: 17px">当前版本/Current Version: <b>v0.3.5</b></p>
<p style="font-size: 17px"><a href="https://github.com/NaiboWang/EasySpider/releases" target="_blank">Github</a>最新版本/Newest Version:<b>{{newest_version}}</b></p>
<!-- <p>如发现新版本更新,可从以下Github仓库下载最新版本使用/If a new version is found, you can download the latest version from the following Github repository:</p>-->
<!-- <p></p>-->
<div class="img-container">
<!-- <h5>出品方/Producer</h5>-->
<a href="https://www.zju.edu.cn" alt="浙江大学 Zhejiang University" target="_blank"><img src="img/zju.png"></a>
<a href="https://www.nus.edu.sg" alt="新加坡国立大学 National University of Singapore" target="_blank"><img src="img/nuslogo.png"></a>
<a href="https://www.xidian.edu.cn" alt="西安电子科技大学 Xidian University" target="_blank"><img src="img/xidian.png"></a>
</div>
</div>
<div style="padding: 10px; text-align: center;vertical-align: middle;" v-else>
<div v-if="lang=='en'">
<div v-if="step == 0">
<p style="margin-top: 20px">Hint: Click Button below to start.</p>
<p><a @click="step = 1"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;color:white">Design/Modify Task</a>
</p>
<p><a @click="startInvoke('en')"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;color:white">View/Manage/Invoke
Tasks</a></p>
<p>
<a href="https://www.easyspider.cn/index_english.html" target="_blank" style="text-align: center; font-size: 18px">Browse official website to watch tutorials</a>
</p>
<div class="img-container">
<!-- <h5>Producer</h5>-->
<a href="https://www.zju.edu.cn" alt="Zhejiang University" target="_blank"><img src="img/zju.png"></a>
<a href="https://www.nus.edu.sg" alt="National University of Singapore" target="_blank"><img src="img/nuslogo.png"></a>
<a href="https://www.xidian.edu.cn" alt="Xidian University" target="_blank"><img src="img/xidian.png"></a>
</div>
</div>
<div v-else-if="step == 1">
<h4 style="margin-top: 20px">Please select design mode</h4>
<p style="margin-top: 20px; text-align: justify; width:310px; margin-left: 18%">
Clean Mode: Start with a clean browser with no cookie/user data.</p>
<p style="text-align: justify; width:310px; margin-left: 18%">
Data Mode: Start with a browser that stores user data such as website login information and cookies.</p>
<p><a @click="startDesign('en')"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;color:white">Start Clean Mode</a>
</p>
<p><a @click="startDesign('en', false, true)"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;color:white">Clean Mode (Mobile)</a>
</p>
<p><a @click="step = 2"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;color:white">Start Data Mode</a>
</p>
<a @click="step = 0" class="btn btn-outline-primary btn-lg" style="margin-top: 10px; width: 302px;height:45px;padding-top:5px">Go to Home Page</a>
</div>
<div v-else-if="step == 2">
<h4 style="margin-top: 20px">Specify user data folder</h4>
<div style="margin: 0 auto; width:90%">
<p style="margin-top: 20px; text-align: justify">
Please specify the directory of user data below. Once set, the browser will load cookies and other contents such as user login information from this directory. The browser will load data from this directory every time it is designed and executed, as long as the directory remains the same. </p>
<p style="text-align: justify">For example, if the <b>./user_data</b> folder is set and you log in at <b>ebay.com</b> during the design process, then the previous login status will still be retained when you specify the <b>./user_data</b> folder again for the next design or task execution when you open <b>ebay.com</b>.</p>
<p style="text-align: justify">If there are multiple configurations, different directories can be set for each configuration. Each directory will be treated as a separate configuration set, and if a directory does not exist, it will be created automatically.</p>
<p><textarea class="form-control" style="min-height: 50px;"
v-model="user_data_folder"></textarea>
</p>
</div>
<p><a @click="startDesign('en', true)"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;color:white">Start Design</a></p>
<p>
<p><a @click="startDesign('en', true, true)"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;color:white">Start Design (Mobile)</a></p>
<p>
<a @click="step = 0" class="btn btn-outline-primary btn-lg" style="margin-top: 10px; width: 302px;height:45px;padding-top:5px">Go to Home Page</a>
</p>
</div>
</div>
<div v-else-if="lang=='zh'">
<div v-if="step == 0">
<p style="margin-top: 20px">提示:点击下方按钮开始使用。</p>
<p><a @click="step = 1" class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;color:white">设计/修改任务</a></p>
<p><a @click="startInvoke('zh')"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 300px;height:60px;padding-top:12px;color:white">查看/管理/执行任务</a>
</p>
<p>
<a href="https://www.easyspider.cn?lang=zh" target="_blank" style="text-align: center; font-size: 18px">点此访问官网查看文档/视频教程</a>
</p>
<div class="img-container">
<!-- <h5>出品方</h5>-->
<a href="https://www.zju.edu.cn" alt="浙江大学" target="_blank"><img src="img/zju.png"></a>
<a href="https://www.nus.edu.sg" alt= "新加坡国立大学" target="_blank"><img src="img/nuslogo.png"></a>
<a href="https://www.xidian.edu.cn" alt="西安电子科技大学" target="_blank"><img src="img/xidian.png"></a>
</div>
</div>
<div v-else-if="step == 1">
<h4 style="margin-top: 20px">请选择设计模式</h4>
<p style="margin-top: 20px; text-align: left; width:320px; margin-left: 18%">
纯净版浏览器:无任何用户信息的浏览器。</p>
<p style="text-align: left; width:320px; margin-left: 18%">
带用户信息的浏览器:保存有用户数据,如网站的登录信息,cookie的浏览器。</p>
<p><a @click="startDesign('zh')"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 320px;height:60px;padding-top:12px;color:white;">使用纯净版浏览器设计</a>
</p>
<p><a @click="startDesign('zh', false, true)"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 320px;height:60px;padding-top:12px;color:white;">纯净版浏览器设计(手机模式)</a>
</p>
<p><a @click="step = 2" class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 320px;height:60px;padding-top:12px;color:white">使用带用户信息浏览器设计</a>
</p>
<p>
<a @click="step = 0" class="btn btn-outline-primary btn-lg" style="margin-top: 10px; width: 322px;height:45px;padding-top:5px">返回首页</a>
</p>
</div>
<div v-else-if="step == 2">
<h4 style="margin-top: 20px">指定用户信息目录</h4>
<div style="margin: 0 auto; width:90%">
<p style="margin-top: 20px; text-align: left">
请在下方指定用户信息目录。设置后,浏览器将加载目录里的cookie,如用户的登录信息等内容,目录不变的情况下,每次设计和执行时浏览器都会加载此目录里的数据。</p>
<p style="margin-top: 10px; text-align: left">例如:设置了./user_data文件夹,并在设计过程中登录了知乎网站,则下次再次设计或者执行任务时指定./user_data文件夹,打开知乎网站页面会仍然保留之前的登录状态。</p>
<p style="margin-top: 10px; text-align: left">如果有多套配置,可以设置不同的目录,每个目录为一套,如果目录不存在将会被自动创建。</p>
<p><textarea class="form-control" style="min-height: 50px;"
v-model="user_data_folder"></textarea>
</p>
</div>
<p><a @click="startDesign('zh', true)"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 320px;height:60px;padding-top:12px;color:white">开始设计</a></p>
<p>
<p><a @click="startDesign('zh', true, true)"
class="btn btn-primary btn-lg"
style="margin-top: 15px; width: 320px;height:60px;padding-top:12px;color:white">开始设计(手机模式)</a></p>
<p>
<a @click="step = 0" class="btn btn-outline-primary btn-lg" style="margin-top: 10px; width: 322px;height:45px;padding-top:5px">返回首页</a>
</p>
</div>
</div>
</div>
</div>
<script type="module" src="index.js"></script>
</body>
</html>

+ 8
- 0
ElectronJS/src/index.html Bestand weergeven

@ -55,6 +55,9 @@
<div style="padding: 10px; text-align: center;vertical-align: middle;" v-else>
<div v-if="lang=='en'">
<div v-if="step == -1">
TEST
</div>
<div v-if="step == 0">
<p style="margin-top: 20px">Hint: Click Button below to start.</p>
@ -125,6 +128,11 @@
</div>
</div>
<div v-else-if="lang=='zh'">
<div v-if="step == -1">
<h4 style="margin-top: 20px">版权和注意事项声明</h4>
<p>请仔细阅读下方有关软件使用和商用付费的说明,并接受使用协议以使用本软件。</p>
<textarea class="form-control" style="min-height: 200px;" readonly></textarea>
</div>
<div v-if="step == 0">
<p style="margin-top: 20px">提示:点击下方按钮开始使用。</p>

+ 5
- 0
ElectronJS/src/index.js Bestand weergeven

@ -24,11 +24,16 @@ var app = Vue.createApp({
init: true,
lang: 'zh',
user_data_folder: getUrlParam("user_data_folder"),
copyright: 0,
step: 0,
newest_version: '-', // 最新版本号
}
},
mounted() {
this.copyright = parseInt(getUrlParam("copyright"));
if(this.copyright == 0){
this.step = -1;
}
// 发送GET请求获取GitHub的Release API响应
const request = new XMLHttpRequest();
request.open('GET', `https://api.github.com/repos/NaiboWang/EasySpider/releases/latest`);

+ 3
- 3
ElectronJS/src/taskGrid/FlowChart_CN.html Bestand weergeven

@ -110,7 +110,7 @@
<option value = 0>不滚动</option>
<option value = 1>向下滚动一屏</option>
<option value = 2>滚动到底部</option>
<option value = 3>一直滚动直到页面内容无变化(需设置好滚动后的等待时间,等待时间太短容易检测不到新数据</option>
<option value = 3>一直滚动直到页面内容无变化(需设置好滚动后的等待时间用于检测页面变化)</option>
</select>
<label>滚动次数(滚动类型设置为<b>不滚动</b><b>一直滚动</b>时请忽略此项):</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['scrollCount']" type="number" required></input>
@ -156,7 +156,7 @@
<option value = 0>不滚动</option>
<option value = 1>向下滚动一屏</option>
<option value = 2>滚动到底部</option>
<option value = 3>一直滚动直到页面内容无变化(需设置好滚动后的等待时间,等待时间太短容易检测不到新数据</option>
<option value = 3>一直滚动直到页面内容无变化(需设置好滚动后的等待时间用于检测页面变化)</option>
</select>
<label>滚动次数(滚动类型设置为<b>不滚动</b><b>一直滚动</b>时请忽略此项):</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['scrollCount']" type="number" required></input>
@ -477,7 +477,7 @@
<option value = 0>不滚动</option>
<option value = 1>向下滚动一屏</option>
<option value = 2>滚动到底部</option>
<option value = 3>一直滚动直到页面内容无变化(需设置好滚动后的等待时间,等待时间太短容易检测不到新数据</option>
<option value = 3>一直滚动直到页面内容无变化(需设置好滚动后的等待时间用于检测页面变化)</option>
</select>
<label>滚动次数(滚动类型设置为<b>不滚动</b><b>一直滚动</b>时请忽略此项):</label>
<input onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['scrollCount']" type="number" required></input>

+ 1
- 0
ElectronJS/src/taskGrid/newTask.html Bestand weergeven

@ -48,6 +48,7 @@
<p></p>
<h5>{{"Example 2~示例2" | lang}}</h5>
<p>{{"(右键)选中一个商品标题,同类型标题会被自动匹配,点击“选中全部”选项 -> 点击“采集数据”选项,即可采集到所有商品的标题信息。~ (Right Click) Select a product title, the same type of title will be automatically matched, click the 'Select All' option -> Click the 'Collect Data' option, you can collect the title information of all products." | lang}}</p>
<p>{{"同时,选中全部后如果选择“循环点击每个元素”选项,即可自动打开每个商品的详情页,然后再采集详情页的信息。~ At the same time, if you select the 'Loop-click every element' after selecting all, you can automatically open the details page of each product, and then collect the information of the details page." | lang}}</p>
<img src="../img/animation_en.gif" alt="" style="width: 100%;height: 100%">
<p></p>
</div>

+ 1
- 1869
ElectronJS/tasks/162.json
Diff onderdrukt omdat het bestand te groot is
Bestand weergeven


+ 13
- 10
ElectronJS/update_chrome.py Bestand weergeven

@ -51,7 +51,9 @@ def get_chrome_version():
return "115"
update_version = get_chrome_version() # 要更新的chromedriver版本
chrome_version = get_chrome_version() # 要更新的chromedriver版本
print("Detected your chrome version is: ", chrome_version)
chrome_driver_url = "https://googlechromelabs.github.io/chrome-for-testing/known-good-versions-with-downloads.json"
win64_chrome_path = "C:\\Program Files\\Google\\Chrome\\Application"
@ -84,7 +86,7 @@ if __name__ == "__main__":
versions = versions[::-1] # 倒序排列数组
for info in versions:
version = info["version"]
if version.find(update_version) >= 0:
if version.find(chrome_version) >= 0:
downloads = info["downloads"]
if "chromedriver" in downloads:
print(info["version"])
@ -93,11 +95,11 @@ if __name__ == "__main__":
else:
print("Error: " + response.status_code)
exit(1)
if not driver_downloads and int(update_version) < 115:
if update_version not in old_driver_version:
if not driver_downloads and int(chrome_version) < 115:
if chrome_version not in old_driver_version:
print("没有可用的chromedriver")
exit(1)
full_version = old_driver_version[update_version]
full_version = old_driver_version[chrome_version]
driver_downloads = [
{
"platform": "linux64",
@ -120,6 +122,7 @@ if __name__ == "__main__":
"url": f"http://chromedriver.storage.googleapis.com/{full_version}/chromedriver_win32.zip",
},
]
if os.path.exists("./chromedrivers"):
shutil.rmtree("./chromedrivers")
os.mkdir("./chromedrivers")
@ -127,7 +130,7 @@ if __name__ == "__main__":
for download in driver_downloads:
if download["platform"] == "win64":
url = download["url"]
print(url)
print("ChromeDriver will be downloaded from: ", url)
break
download_and_extract_zip(url, "./chromedrivers")
if os.path.exists("./chrome_win64"):
@ -154,7 +157,7 @@ if __name__ == "__main__":
for download in driver_downloads:
if download["platform"] == "win32":
url = download["url"]
print(url)
print("ChromeDriver will be downloaded from: ", url)
break
download_and_extract_zip(url, "./chromedrivers")
if os.path.exists("./chrome_win32"):
@ -167,13 +170,13 @@ if __name__ == "__main__":
copy_file("./stealth.min.js", "./chrome_win32/stealth.min.js")
try:
copy_file(
"./chromedrivers/chromedriver-win64/chromedriver.exe",
"./chrome_win64/chromedriver_win64.exe",
"./chromedrivers/chromedriver-win32/chromedriver.exe",
"./chrome_win32/chromedriver_win32.exe",
)
except:
copy_file(
"./chromedrivers/chromedriver.exe",
"./chrome_win64/chromedriver_win64.exe",
"./chrome_win32/chromedriver_win64.exe",
)
finally:
shutil.rmtree("./chromedrivers")

+ 1
- 1
ExecuteStage/.vscode/launch.json Bestand weergeven

@ -12,7 +12,7 @@
"justMyCode": false,
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--id", "[3]", "--headless", "0", "--user_data", "1"]
"args": ["--id", "[4]", "--headless", "0", "--user_data", "1"]
}
]
}

+ 13
- 5
ExecuteStage/easyspider_executestage.py Bestand weergeven

@ -1437,6 +1437,8 @@ if __name__ == '__main__':
if sys.platform == "win32" and platform.architecture()[0] == "32bit":
options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
option.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
@ -1444,12 +1446,15 @@ if __name__ == '__main__':
elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
option.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
@ -1469,6 +1474,7 @@ if __name__ == '__main__':
print("Finding chromedriver in EasySpider",
os.getcwd()+"/ElectronJS")
option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
option.add_extension("../ElectronJS/XPathHelper.crx")
else:
@ -1478,8 +1484,7 @@ if __name__ == '__main__':
option.add_experimental_option(
'excludeSwitches', ['enable-automation']) # 以开发者模式
options.add_argument('log-level=3') # 隐藏日志
option.add_argument('log-level=3') # 隐藏日志
# user_data_dir = r'' # 注意没有Default!
# options.add_argument('--user-data-dir='+p)
@ -1541,6 +1546,8 @@ if __name__ == '__main__':
except:
cloudflare = 0
if cloudflare == 0:
options.add_argument('log-level=3') # 隐藏日志
option.add_argument('log-level=3') # 隐藏日志
options.add_experimental_option("prefs", {
# 设置文件下载路径
"download.default_directory": "Data/Task_" + str(i),
@ -1571,8 +1578,9 @@ if __name__ == '__main__':
options=options, chrome_options=option, executable_path=driver_path)
elif cloudflare == 1:
if sys.platform != "darwin":
options.binary_location = "" # 需要用自己的浏览器
browser_t = MyUCChrome(
options=options, chrome_options=option, driver_executable_path=driver_path)
options=options)
else:
print("Not support Cloudflare Mode on MacOS")
print("MacOS不支持Cloudflare验证模式")
@ -1600,8 +1608,8 @@ if __name__ == '__main__':
print("Running task, long press 'p' to pause the task for manual operation of the browser such as entering the verification code; If you want to resume the execution of the task, please long press 'p' again.")
print("----------------------------------\n\n")
if cloudflare:
print("过Cloudflare验证模式有时候会不稳定,如果无法通过验证则需要隔几分钟重试一次,或者可以更换新的用户信息文件夹再执行任务。")
print("Passing Cloudflare verification mode is sometimes unstable, if you cannot pass the verification, you need to try again every few minutes, or you can change a new user information folder and then execute the task.")
print("过Cloudflare验证模式有时候会不稳定,请注意观察上方提示的浏览器版本信息是否正确,如果无法通过验证则需要隔几分钟重试一次,或者可以更换新的用户信息文件夹再执行任务。")
print("Passing the Cloudflare verification mode is sometimes unstable. Please pay attention to whether the browser version information prompted above is correct. If the verification fails, you need to try again every few minutes, or you can change to a new user information folder and then execute the task.")
# 使用监听器监听键盘输入
try:
with Listener(on_press=on_press_creator(press_time, event), on_release=on_release_creator(event, press_time)) as listener:

+ 5
- 1
ExecuteStage/myChrome.py Bestand weergeven

@ -91,7 +91,11 @@ class MyChrome(webdriver.Chrome):
import sys
if sys.platform != "darwin": # MacOS不支持Cloudflare
import undetected_chromedriver_ES as uc
ES = 1
if ES == 1:
import undetected_chromedriver as uc
else:
import undetected_chromedriver as uc
class MyUCChrome(uc.Chrome):

+ 8
- 3
ExecuteStage/undetected_chromedriver_ES/__init__.py Bestand weergeven

@ -254,7 +254,7 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
user_multi_procs=user_multi_procs,
)
# self.patcher.auto(user_multiprocess = user_multi_num_procs)
self.patcher.auto()
chrome_version = self.patcher.auto()
# self.patcher = patcher
if not options:
@ -369,8 +369,10 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
if not options.binary_location:
options.binary_location = (
browser_executable_path or find_chrome_executable()
browser_executable_path or find_chrome_executable(chrome_version)
)
print("Options Binary Location: ", options.binary_location)
self._delay = 3
@ -811,7 +813,7 @@ class Chrome(selenium.webdriver.chrome.webdriver.WebDriver):
self.service.process.kill()
def find_chrome_executable():
def find_chrome_executable(version):
"""
Finds the chrome, chrome beta, chrome canary, chromium executable
@ -853,4 +855,7 @@ def find_chrome_executable():
candidates.add(os.sep.join((item, subitem, "chrome.exe")))
for candidate in candidates:
if os.path.exists(candidate) and os.access(candidate, os.X_OK):
print("\n\n\n软件将会使用以下目录的Chrome浏览器:", os.path.normpath(candidate), ",请检查此浏览器版本是否为" + str(version) + "版本,如果不是将无法运行。")
print("The software will use the Chrome browser in the following directory:", os.path.normpath(candidate), "Please check if the version of this browser is version " + str(version) + ", if not, it will not be able to run.\n\n\n")
time.sleep(5)
return os.path.normpath(candidate)

+ 14
- 3
ExecuteStage/undetected_chromedriver_ES/patcher.py Bestand weergeven

@ -10,6 +10,7 @@ import random
import re
import shutil
import string
import subprocess
import sys
import time
from urllib.request import urlopen
@ -134,11 +135,21 @@ class Patcher(object):
if not os.path.exists(new_file):
shutil.copy(self.executable_path, new_file)
self.executable_path = new_file # 用新的chromedriver
print(f"New chromedriver path: {self.executable_path}")
ispatched = self.is_binary_patched(self.executable_path)
folder_path = os.path.dirname(os.path.abspath(self.executable_path))
folder_list = [f for f in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, f))]
numeric_folders = [f for f in folder_list if f[0].isdigit()]
version = numeric_folders[0].split('.')[0]
print(f"\n\n\nCloudflare下需要自行安装浏览器,请确保自己的机器环境已经安装了 {numeric_folders[0].split('.')[0]} 版本的Chrome浏览器(不是软件自带的Chrome浏览器,需要自己安装浏览器且版本号一定要正确),否则程序无法运行!")
print("Please make sure that your machine environment has installed the Chrome browser version %s (not the Chrome browser provided by the software, you need to install the browser yourself and the version number must be correct), otherwise the program cannot run!" % numeric_folders[0].split('.')[0])
if not ispatched:
return self.patch_exe()
print("Patching chromedriver...")
return version
else:
return
print("No need to patch chromedriver.")
return version
if version_main:
self.version_main = version_main
@ -296,7 +307,7 @@ class Patcher(object):
def patch_exe(self):
start = time.perf_counter()
logger.info("patching driver executable %s" % self.executable_path)
print("patching driver executable %s" % self.executable_path)
with io.open(self.executable_path, "r+b") as fh:
content = fh.read()
# match_injected_codeblock = re.search(rb"{window.*;}", content)

+ 4
- 0
Readme.md Bestand weergeven

@ -28,8 +28,12 @@ A visual code-free/no-code web crawler/spider, just select the content you want
(右键)选中一个商品标题,同类型标题会被自动匹配,点击“选中全部”选项 -> 点击“采集数据”选项,即可采集到所有商品的标题信息。
同时,选中全部后如果选择“循环点击每个元素”选项,即可自动打开每个商品的详情页,然后再采集详情页的信息。
(Right Click) Select a product title, the same type of title will be automatically matched, click the 'Select All' option -> Click the 'Collect Data' option, you can collect the title information of all products.
At the same time, if you select the 'Loop-click every element' option after selecting all, you can automatically open the details page of each product, and then collect the information of the details page.
![animation_en](media/animation_en.gif)
### 更多特性/More Features

Laden…
Annuleren
Opslaan