|
|
@ -9,7 +9,7 @@ import threading |
|
|
|
# import undetected_chromedriver as uc |
|
|
|
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \ |
|
|
|
on_press_creator, on_release_creator, readCode, rename_downloaded_file, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json |
|
|
|
from constants import WriteMode |
|
|
|
from constants import WriteMode, DataWriteMode |
|
|
|
from myChrome import MyChrome |
|
|
|
from threading import Thread, Event |
|
|
|
from PIL import Image |
|
|
@ -154,7 +154,7 @@ class BrowserThread(Thread): |
|
|
|
self.maxViewLength = service.get("maxViewLength", 15) # 最大显示长度 |
|
|
|
self.outputFormat = service.get("outputFormat", "csv") # 输出格式 |
|
|
|
self.save_threshold = service.get("saveThreshold", 10) # 保存最低阈值 |
|
|
|
self.dataWriteMode = service.get("dataWriteMode", 1) # 数据写入模式,1为追加,2为覆盖,3为重命名文件 |
|
|
|
self.dataWriteMode = service.get("dataWriteMode", DataWriteMode.Append.value) # 数据写入模式,1为追加,2为覆盖,3为重命名文件 |
|
|
|
self.task_version = service.get("version", "") # 任务版本 |
|
|
|
|
|
|
|
if not self.task_version: |
|
|
@ -176,37 +176,40 @@ class BrowserThread(Thread): |
|
|
|
self.links = list(filter(isnotnull, service_links.split("\n"))) # 要执行的link的列表 |
|
|
|
else: |
|
|
|
self.links = list(filter(isnotnull, service["url"])) # 要执行的link |
|
|
|
|
|
|
|
self.OUTPUT = [] # 采集的数据 |
|
|
|
if self.outputFormat in ["csv", "txt", "xlsx", "json"]: |
|
|
|
if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat): |
|
|
|
if self.dataWriteMode == 2: |
|
|
|
if self.dataWriteMode == DataWriteMode.Cover.value: |
|
|
|
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat) |
|
|
|
elif self.dataWriteMode == 3: |
|
|
|
elif self.dataWriteMode == DataWriteMode.Rename.value: |
|
|
|
i = 2 |
|
|
|
while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat): |
|
|
|
i = i + 1 |
|
|
|
self.saveName = self.saveName + '_' + str(i) |
|
|
|
self.print_and_log("文件已存在,已重命名为", self.saveName) |
|
|
|
self.writeMode = WriteMode.Create_Mode.value # 写入模式,0为新建,1为追加 |
|
|
|
self.writeMode = WriteMode.Create.value # 写入模式,0为新建,1为追加 |
|
|
|
if self.outputFormat in ['csv', 'txt', 'xlsx']: |
|
|
|
if not os.path.exists(f"Data/Task_{str(self.id)}/{self.saveName}.{self.outputFormat}"): |
|
|
|
self.OUTPUT.append([]) # 添加表头 |
|
|
|
self.writeMode = WriteMode.Create_Mode.value |
|
|
|
self.writeMode = WriteMode.Create.value |
|
|
|
elif self.outputFormat == "json": |
|
|
|
self.writeMode = WriteMode.Json_Mode.value # JSON模式无需判断是否存在文件 |
|
|
|
self.writeMode = WriteMode.Json.value # JSON模式无需判断是否存在文件 |
|
|
|
elif self.outputFormat == "mysql": |
|
|
|
self.mysql = myMySQL(config["mysql_config_path"]) |
|
|
|
self.mysql.create_table(self.saveName, service["outputParameters"], remove_if_exists=self.dataWriteMode == 2) |
|
|
|
self.writeMode = WriteMode.MySQL_Mode.value # MySQL模式 |
|
|
|
self.mysql.create_table(self.saveName, service["outputParameters"], |
|
|
|
remove_if_exists=self.dataWriteMode == DataWriteMode.Cover.value) |
|
|
|
self.writeMode = WriteMode.MySQL.value # MySQL模式 |
|
|
|
|
|
|
|
if self.writeMode == WriteMode.Create_Mode.value: |
|
|
|
if self.writeMode == WriteMode.Create.value: |
|
|
|
self.print_and_log("新建模式|Create Mode") |
|
|
|
elif self.writeMode == WriteMode.Append_Mode.value: |
|
|
|
elif self.writeMode == WriteMode.Append.value: |
|
|
|
self.print_and_log("追加模式|Append Mode") |
|
|
|
elif self.writeMode == WriteMode.MySQL_Mode.value: |
|
|
|
elif self.writeMode == WriteMode.MySQL.value: |
|
|
|
self.print_and_log("MySQL模式|MySQL Mode") |
|
|
|
elif self.writeMode == WriteMode.Json_Mode.value: |
|
|
|
elif self.writeMode == WriteMode.Json.value: |
|
|
|
self.print_and_log("JSON模式|JSON Mode") |
|
|
|
|
|
|
|
self.containJudge = service["containJudge"] # 是否含有判断语句 |
|
|
|
self.outputParameters = {} |
|
|
|
self.service = service |
|
|
@ -222,7 +225,7 @@ class BrowserThread(Thread): |
|
|
|
self.outputParametersTypes.append(param.get("type", "text")) |
|
|
|
self.outputParametersRecord.append(bool(param.get("recordASField", True))) |
|
|
|
# 文件叠加的时候不添加表头 |
|
|
|
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create_Mode.value: |
|
|
|
if self.outputFormat in ["csv", "txt", "xlsx"] and self.writeMode == WriteMode.Create.value: |
|
|
|
self.OUTPUT[0].append(param["name"]) |
|
|
|
self.urlId = 0 # 全局记录变量 |
|
|
|
self.preprocess() # 预处理,优化提取数据流程 |
|
|
|