Selaa lähdekoodia

Deal with data:

pull/247/head
naibo 10 kuukautta sitten
vanhempi
commit
4025e255a0
15 muutettua tiedostoa jossa 40 lisäystä ja 10 poistoa
  1. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/126.json
  2. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/127.json
  3. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/128.json
  4. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/129.json
  5. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/130.json
  6. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/131.json
  7. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/132.json
  8. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/133.json
  9. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/134.json
  10. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/tasks/235.json
  11. +5
    -2
      ElectronJS/server.js
  12. +1
    -1
      ElectronJS/src/taskGrid/logic.js
  13. +1
    -1
      ElectronJS/tasks/209.json
  14. +1
    -1
      ExecuteStage/.vscode/launch.json
  15. +22
    -5
      ExecuteStage/easyspider_executestage.py

+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/126.json Näytä tiedosto

@ -0,0 +1 @@
{"id":126,"name":"【软科排名】-中国最好学科排名|最权威的大学学科|高校学科排名","url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","create_time":"11/23/2023, 3:32:45 AM","update_time":"11/23/2023, 3:32:45 AM","version":"0.5.0","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"containJudge":false,"desc":"https://www.shanghairanking.cn/rankings/bcsr/2023","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.shanghairanking.cn/rankings/bcsr/2023","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.shanghairanking.cn/rankings/bcsr/2023"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"text","recordASField":1,"exampleValue":"哲学"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.shanghairanking.cn/rankings/bcsr/2023","links":"https://www.shanghairanking.cn/rankings/bcsr/2023","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div/div[2]/div/a[1]/span[2]","iframe":false,"wait":0,"wa
itType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[1]/div[1]/div[1]/div[2]/div[2]/div[1]/div[1]/div[2]/div[1]/a[1]/span[2]","//span[contains(., '哲学')]","/html/body/div[last()-3]/div/div/div[last()-2]/div/div[last()-1]/div[last()-11]/div/div/a/span"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"paras":[{"nodeType":0,"contentType":0,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"哲学"}],"unique_index":"v5hqcije1galpa5w28w","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}

+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/127.json
File diff suppressed because it is too large
Näytä tiedosto


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/128.json
File diff suppressed because it is too large
Näytä tiedosto


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/129.json
File diff suppressed because it is too large
Näytä tiedosto


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/130.json
File diff suppressed because it is too large
Näytä tiedosto


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/131.json
File diff suppressed because it is too large
Näytä tiedosto


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/132.json
File diff suppressed because it is too large
Näytä tiedosto


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/133.json
File diff suppressed because it is too large
Näytä tiedosto


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/134.json
File diff suppressed because it is too large
Näytä tiedosto


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/tasks/235.json
File diff suppressed because it is too large
Näytä tiedosto


+ 5
- 2
ElectronJS/server.js Näytä tiedosto

@ -323,8 +323,11 @@ exports.start = function(port = 8074) {
task = JSON.parse(task);
try{
task["links"] = data["urlList_0"];
}catch(error){
console.log(error);
if (tasks["links"] == undefined) {
task["links"] = "about:blank";
}
} catch(error) {
task["links"] = "about:blank";
}
for (const [key, value] of Object.entries(data)) {
for (let i = 0; i < task["inputParameters"].length; i++) {

+ 1
- 1
ElectronJS/src/taskGrid/logic.js Näytä tiedosto

@ -359,7 +359,7 @@ function saveService(type) {
let outputNames = [];
let inputIndex = 0;
let outputIndex = 0;
let links = ""; //记录所有的link
let links = "about:blank"; //记录所有的link
let containJudge = false; //是否含有判断语句
let saveThreshold = parseInt($("#saveThreshold").val());
let cloudflare = parseInt($("#cloudflare").val());

+ 1
- 1
ElectronJS/tasks/209.json
File diff suppressed because it is too large
Näytä tiedosto


+ 1
- 1
ExecuteStage/.vscode/launch.json Näytä tiedosto

@ -12,7 +12,7 @@
"justMyCode": false,
// "args": ["--id", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--id", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--id", "[125]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
"args": ["--id", "[134]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
}
]
}

+ 22
- 5
ExecuteStage/easyspider_executestage.py Näytä tiedosto

@ -337,7 +337,7 @@ class BrowserThread(Thread):
if "urlList_0" in data.keys():
self.links = data["urlList_0"]
except:
pass
self.links = "about:blank"
task = self.service
for key, value in data.items():
for i in range(len(task["inputParameters"])):
@ -987,7 +987,8 @@ class BrowserThread(Thread):
self.print_and_log("Loop element not found: ",
xpath)
self.print_and_log("找不到循环元素: ", xpath)
for index in range(len(elements)):
index = 0
while index < len(elements):
for i in node["sequence"]: # 挨个顺序执行循环里所有的操作
self.executeNode(i, elements[index],
xpath, index)
@ -1033,16 +1034,22 @@ class BrowserThread(Thread):
except:
pass
if self.browser.current_url.startswith("data:"):
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
time.sleep(2)
elements = self.browser.find_elements(By.XPATH,
xpath, iframe=node["parameters"]["iframe"])
if index > 0:
index -= 1 # 如果是data:开头的网址,就要重试一次
if int(node["parameters"]["breakMode"]) > 0: # 如果设置了退出循环的脚本条件
output = self.execute_code(int(
node["parameters"]["breakMode"]) - 1, node["parameters"]["breakCode"], node["parameters"]["breakCodeWaitTime"], iframe=node["parameters"]["iframe"])
code = get_output_code(output)
if code <= 0:
break
index = index + 1
except NoSuchElementException:
self.print_and_log("Loop element not found: ", xpath)
self.print_and_log("找不到循环元素: ", xpath)
@ -1050,7 +1057,11 @@ class BrowserThread(Thread):
raise
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
# 千万不要忘了分割!!
for path in node["parameters"]["pathList"].split("\n"):
paths = node["parameters"]["pathList"].split("\n")
# for path in node["parameters"]["pathList"].split("\n"):
index = 0
while index < len(paths):
path = paths[index]
try:
path = replace_field_values(
path, self.outputParameters, self)
@ -1100,10 +1111,15 @@ class BrowserThread(Thread):
except:
pass
if self.browser.current_url.startswith("data:"):
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
try:
self.browser.execute_script("history.go(1)") # 如果是data:开头的网址,就前进一步
except: # 超时的情况下
pass
time.sleep(2)
elements = self.browser.find_elements(By.XPATH,
xpath, iframe=node["parameters"]["iframe"])
if index > 0:
index -= 1 # 如果是data:开头的网址,就要重试一次
except NoSuchElementException:
self.print_and_log("Loop element not found: ", path)
self.print_and_log("找不到循环元素: ", path)
@ -1116,6 +1132,7 @@ class BrowserThread(Thread):
code = get_output_code(output)
if code <= 0:
break
index = index + 1
elif int(node["parameters"]["loopType"]) == 3: # 固定文本列表
textList = node["parameters"]["textList"].split("\n")
if len(textList) == 1: # 如果固定文本列表只有一行,现在就可以替换变量

Ladataan…
Peruuta
Tallenna