|
|
@ -235,7 +235,7 @@ class BrowserThread(Thread): |
|
|
|
# 检测如果没有复杂的操作,优化提取数据流程 |
|
|
|
def preprocess(self): |
|
|
|
for index_node, node in enumerate(self.procedure): |
|
|
|
parameters = node["parameters"] |
|
|
|
parameters: dict = node["parameters"] |
|
|
|
iframe = parameters.get('iframe') |
|
|
|
option = node["option"] |
|
|
|
|
|
|
@ -310,86 +310,77 @@ class BrowserThread(Thread): |
|
|
|
"a good model, such as PaddleOCR to recognize the picture, and then return " |
|
|
|
"the return value as a parameter output to the program.") |
|
|
|
param["optimizable"] = detect_optimizable(param) |
|
|
|
elif node["option"] == 4: # 输入文字 |
|
|
|
try: |
|
|
|
index = node["parameters"]["index"] # 索引值 |
|
|
|
except: |
|
|
|
node["parameters"]["index"] = 0 |
|
|
|
elif node["option"] == 5: # 自定义操作 |
|
|
|
try: |
|
|
|
clear = node["parameters"]["clear"] |
|
|
|
except: |
|
|
|
node["parameters"]["clear"] = 0 |
|
|
|
try: |
|
|
|
newLine = node["parameters"]["newLine"] |
|
|
|
except: |
|
|
|
node["parameters"]["newLine"] = 1 |
|
|
|
elif node["option"] == 7: # 移动到元素 |
|
|
|
if node["parameters"]["useLoop"]: |
|
|
|
elif option == GraphOption.Input.value: # 输入文字 |
|
|
|
index = parameters.get('index') |
|
|
|
if not index: |
|
|
|
parameters['index'] = 0 |
|
|
|
elif option == GraphOption.Custom.value: # 自定义操作 |
|
|
|
clear = parameters.get('clear') |
|
|
|
if not clear: |
|
|
|
parameters['clear'] = 0 |
|
|
|
newLine = parameters.get('newLine') |
|
|
|
if not newLine: |
|
|
|
parameters['newLine'] = 1 |
|
|
|
elif option == GraphOption.Move.value: # 移动到元素 |
|
|
|
if parameters.get('useLoop'): |
|
|
|
if self.task_version <= "0.3.5": |
|
|
|
# 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath |
|
|
|
node["parameters"]["xpath"] = "" |
|
|
|
self.print_and_log("您的任务版本号为" + self.task_version + |
|
|
|
",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath") |
|
|
|
elif node["option"] == 8: # 循环操作 |
|
|
|
try: |
|
|
|
exitElement = node["parameters"]["exitElement"] |
|
|
|
if exitElement == "": |
|
|
|
node["parameters"]["exitElement"] = "//body" |
|
|
|
except: |
|
|
|
node["parameters"]["exitElement"] = "//body" |
|
|
|
node["parameters"]["quickExtractable"] = False # 是否可以快速提取 |
|
|
|
try: |
|
|
|
skipCount = node["parameters"]["skipCount"] |
|
|
|
except: |
|
|
|
node["parameters"]["skipCount"] = 0 |
|
|
|
parameters["xpath"] = "" |
|
|
|
self.print_and_log(f"您的任务版本号为{self.task_version},循环点击不支持相对XPath写法,已自动切换为纯循环的XPath") |
|
|
|
elif option == GraphOption.Loop.value: # 循环操作 |
|
|
|
exitElement = parameters.get('exitElement') |
|
|
|
if not exitElement or exitElement == "": |
|
|
|
parameters['exitElement'] = "//body" |
|
|
|
parameters["quickExtractable"] = False # 是否可以快速提取 |
|
|
|
|
|
|
|
skipCount = parameters.get('skipCount') |
|
|
|
if not skipCount: |
|
|
|
parameters['skipCount'] = 0 |
|
|
|
|
|
|
|
# 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取 |
|
|
|
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2): |
|
|
|
try: |
|
|
|
params = self.procedure[node["sequence"][0]]["parameters"]["params"] |
|
|
|
except: |
|
|
|
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider |
|
|
|
try: |
|
|
|
waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"] |
|
|
|
except: |
|
|
|
waitElement = "" |
|
|
|
if node["parameters"]["iframe"]: |
|
|
|
node["parameters"]["quickExtractable"] = False # 如果是iframe,那么不可以快速提取 |
|
|
|
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 \ |
|
|
|
and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2): |
|
|
|
params = self.procedure[node["sequence"][0]].get("parameters").get("params") |
|
|
|
if not params: |
|
|
|
params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider |
|
|
|
|
|
|
|
waitElement = self.procedure[node["sequence"][0]]["parameters"].get("waitElement", "") |
|
|
|
|
|
|
|
if parameters["iframe"]: |
|
|
|
parameters["quickExtractable"] = False # 如果是iframe,那么不可以快速提取 |
|
|
|
else: |
|
|
|
node["parameters"]["quickExtractable"] = True # 先假设可以快速提取 |
|
|
|
if node["parameters"]["skipCount"] > 0: |
|
|
|
node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取 |
|
|
|
parameters["quickExtractable"] = True # 先假设可以快速提取 |
|
|
|
|
|
|
|
if parameters["skipCount"] > 0: |
|
|
|
parameters["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取 |
|
|
|
|
|
|
|
for param in params: |
|
|
|
optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement) |
|
|
|
try: |
|
|
|
iframe = param["iframe"] |
|
|
|
except: |
|
|
|
param["iframe"] = False |
|
|
|
if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取 |
|
|
|
iframe = param.get('iframe') |
|
|
|
if not iframe: |
|
|
|
param['iframe'] = False |
|
|
|
|
|
|
|
if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取 |
|
|
|
optimizable = False |
|
|
|
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取 |
|
|
|
node["parameters"]["quickExtractable"] = False |
|
|
|
if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取 |
|
|
|
parameters["quickExtractable"] = False |
|
|
|
break |
|
|
|
if node["parameters"]["quickExtractable"]: |
|
|
|
self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据") |
|
|
|
self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly") |
|
|
|
try: |
|
|
|
node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"] |
|
|
|
except: |
|
|
|
node["parameters"]["clear"] = 0 |
|
|
|
try: |
|
|
|
node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"] |
|
|
|
except: |
|
|
|
node["parameters"]["newLine"] = 1 |
|
|
|
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表 |
|
|
|
|
|
|
|
if parameters["quickExtractable"]: |
|
|
|
self.print_and_log(f"循环操作<{node['title']}>可以快速提取数据") |
|
|
|
self.print_and_log(f"Loop operation <{node["title"]}> can extract data quickly") |
|
|
|
parameters["clear"] = self.procedure[node["sequence"][0]]["parameters"].get("clear", 0) |
|
|
|
parameters["newLine"] = self.procedure[node["sequence"][0]]["parameters"].get("newLine", 1) |
|
|
|
|
|
|
|
if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表 |
|
|
|
node["parameters"]["baseXPath"] = node["parameters"]["xpath"] |
|
|
|
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表 |
|
|
|
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表 |
|
|
|
node["parameters"]["baseXPath"] = node["parameters"]["pathList"] |
|
|
|
node["parameters"]["quickParams"] = [] |
|
|
|
for param in params: |
|
|
|
content_type = "" |
|
|
|
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 or param["relativeXPath"].find( |
|
|
|
"::text()") >= 0: |
|
|
|
if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 \ |
|
|
|
or param["relativeXPath"].find("::text()") >= 0: |
|
|
|
content_type = "" |
|
|
|
elif param["nodeType"] == 2: |
|
|
|
content_type = "//@href" |
|
|
|