diff --git a/ExecuteStage/easyspider_executestage.py b/ExecuteStage/easyspider_executestage.py index 4155f9c..4c3328e 100644 --- a/ExecuteStage/easyspider_executestage.py +++ b/ExecuteStage/easyspider_executestage.py @@ -235,7 +235,7 @@ class BrowserThread(Thread): # 检测如果没有复杂的操作,优化提取数据流程 def preprocess(self): for index_node, node in enumerate(self.procedure): - parameters = node["parameters"] + parameters: dict = node["parameters"] iframe = parameters.get('iframe') option = node["option"] @@ -310,86 +310,77 @@ class BrowserThread(Thread): "a good model, such as PaddleOCR to recognize the picture, and then return " "the return value as a parameter output to the program.") param["optimizable"] = detect_optimizable(param) - elif node["option"] == 4: # 输入文字 - try: - index = node["parameters"]["index"] # 索引值 - except: - node["parameters"]["index"] = 0 - elif node["option"] == 5: # 自定义操作 - try: - clear = node["parameters"]["clear"] - except: - node["parameters"]["clear"] = 0 - try: - newLine = node["parameters"]["newLine"] - except: - node["parameters"]["newLine"] = 1 - elif node["option"] == 7: # 移动到元素 - if node["parameters"]["useLoop"]: + elif option == GraphOption.Input.value: # 输入文字 + index = parameters.get('index') + if not index: + parameters['index'] = 0 + elif option == GraphOption.Custom.value: # 自定义操作 + clear = parameters.get('clear') + if not clear: + parameters['clear'] = 0 + newLine = parameters.get('newLine') + if not newLine: + parameters['newLine'] = 1 + elif option == GraphOption.Move.value: # 移动到元素 + if parameters.get('useLoop'): if self.task_version <= "0.3.5": # 0.3.5及以下版本的EasySpider下的循环点击不支持相对XPath - node["parameters"]["xpath"] = "" - self.print_and_log("您的任务版本号为" + self.task_version + - ",循环点击不支持相对XPath写法,已自动切换为纯循环的XPath") - elif node["option"] == 8: # 循环操作 - try: - exitElement = node["parameters"]["exitElement"] - if exitElement == "": - node["parameters"]["exitElement"] = "//body" - except: - node["parameters"]["exitElement"] = "//body" - node["parameters"]["quickExtractable"] = False # 是否可以快速提取 - try: - skipCount = node["parameters"]["skipCount"] - except: - node["parameters"]["skipCount"] = 0 + parameters["xpath"] = "" + self.print_and_log(f"您的任务版本号为{self.task_version},循环点击不支持相对XPath写法,已自动切换为纯循环的XPath") + elif option == GraphOption.Loop.value: # 循环操作 + exitElement = parameters.get('exitElement') + if not exitElement or exitElement == "": + parameters['exitElement'] = "//body" + parameters["quickExtractable"] = False # 是否可以快速提取 + + skipCount = parameters.get('skipCount') + if not skipCount: + parameters['skipCount'] = 0 + # 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取 - if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2): - try: - params = self.procedure[node["sequence"][0]]["parameters"]["params"] - except: - params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider - try: - waitElement = self.procedure[node["sequence"][0]]["parameters"]["waitElement"] - except: - waitElement = "" - if node["parameters"]["iframe"]: - node["parameters"]["quickExtractable"] = False # 如果是iframe,那么不可以快速提取 + if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 \ + and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2): + params = self.procedure[node["sequence"][0]].get("parameters").get("params") + if not params: + params = self.procedure[node["sequence"][0]]["parameters"]["paras"] # 兼容0.5.0及以下版本的EasySpider + + waitElement = self.procedure[node["sequence"][0]]["parameters"].get("waitElement", "") + + if parameters["iframe"]: + parameters["quickExtractable"] = False # 如果是iframe,那么不可以快速提取 else: - node["parameters"]["quickExtractable"] = True # 先假设可以快速提取 - if node["parameters"]["skipCount"] > 0: - node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取 + parameters["quickExtractable"] = True # 先假设可以快速提取 + + if parameters["skipCount"] > 0: + parameters["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取 + for param in params: optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement) - try: - iframe = param["iframe"] - except: - param["iframe"] = False - if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取 + iframe = param.get('iframe') + if not iframe: + param['iframe'] = False + + if param["iframe"] and not param["relative"]: # 如果是iframe,那么不可以快速提取 optimizable = False - if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取 - node["parameters"]["quickExtractable"] = False + if not optimizable: # 如果有一个不满足优化条件,那么就不能快速提取 + parameters["quickExtractable"] = False break - if node["parameters"]["quickExtractable"]: - self.print_and_log("循环操作<" + node["title"] + ">可以快速提取数据") - self.print_and_log("Loop operation <" + node["title"] + "> can extract data quickly") - try: - node["parameters"]["clear"] = self.procedure[node["sequence"][0]]["parameters"]["clear"] - except: - node["parameters"]["clear"] = 0 - try: - node["parameters"]["newLine"] = self.procedure[node["sequence"][0]]["parameters"]["newLine"] - except: - node["parameters"]["newLine"] = 1 - if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表 + + if parameters["quickExtractable"]: + self.print_and_log(f"循环操作<{node['title']}>可以快速提取数据") + self.print_and_log(f"Loop operation <{node["title"]}> can extract data quickly") + parameters["clear"] = self.procedure[node["sequence"][0]]["parameters"].get("clear", 0) + parameters["newLine"] = self.procedure[node["sequence"][0]]["parameters"].get("newLine", 1) + + if int(node["parameters"]["loopType"]) == 1: # 不固定元素列表 node["parameters"]["baseXPath"] = node["parameters"]["xpath"] - elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表 + elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表 node["parameters"]["baseXPath"] = node["parameters"]["pathList"] node["parameters"]["quickParams"] = [] for param in params: content_type = "" - if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 or param["relativeXPath"].find( - "::text()") >= 0: + if param["relativeXPath"].find("/@href") >= 0 or param["relativeXPath"].find("/text()") >= 0 \ + or param["relativeXPath"].find("::text()") >= 0: content_type = "" elif param["nodeType"] == 2: content_type = "//@href"