Sfoglia il codice sorgente

Official Version of 0.6.0

pull/254/head
naibo 9 mesi fa
parent
commit
ed0768ca51
49 file modificati con 298 aggiunte e 69 eliminazioni
  1. +118
    -28
      .temp_to_pub/EasySpider_windows_x64/Code/easyspider_executestage.py
  2. +151
    -1
      .temp_to_pub/EasySpider_windows_x64/Code/utils.py
  3. +1
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/0.json
  4. +1
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/1.json
  5. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/10.json
  6. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/11.json
  7. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/12.json
  8. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/13.json
  9. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/14.json
  10. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/15.json
  11. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/16.json
  12. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/17.json
  13. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/18.json
  14. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/19.json
  15. +1
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/2.json
  16. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/20.json
  17. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/21.json
  18. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/22.json
  19. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/23.json
  20. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/24.json
  21. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/25.json
  22. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/26.json
  23. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/27.json
  24. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/28.json
  25. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/3.json
  26. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/4.json
  27. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/5.json
  28. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/6.json
  29. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/7.json
  30. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/8.json
  31. +0
    -1
      .temp_to_pub/EasySpider_windows_x64/execution_instances/9.json
  32. +1
    -1
      .temp_to_pub/EasySpider_windows_x64/tasks/149.json
  33. +1
    -1
      .temp_to_pub/EasySpider_windows_x64/tasks/213.json
  34. +1
    -1
      .temp_to_pub/EasySpider_windows_x64/tasks/296.json
  35. +1
    -1
      .temp_to_pub/EasySpider_windows_x64/tasks/8.json
  36. +1
    -0
      .temp_to_pub/compress.py
  37. BIN
      ElectronJS/EasySpider_en.crx
  38. BIN
      ElectronJS/EasySpider_zh.crx
  39. +1
    -0
      ElectronJS/clean_and_release_win32.cmd
  40. +1
    -0
      ElectronJS/clean_and_release_win64.cmd
  41. +7
    -1
      ElectronJS/main.js
  42. +1
    -0
      ElectronJS/package_linux64.sh
  43. +1
    -0
      ElectronJS/package_macos.sh
  44. +1
    -1
      ElectronJS/src/taskGrid/FlowChart.html
  45. +1
    -1
      ElectronJS/src/taskGrid/FlowChart_CN.html
  46. +1
    -1
      ElectronJS/src/taskGrid/executeTask.html
  47. +1
    -1
      ExecuteStage/.vscode/launch.json
  48. +2
    -2
      ExecuteStage/easyspider_executestage.py
  49. +4
    -0
      ExecuteStage/utils.py

+ 118
- 28
.temp_to_pub/EasySpider_windows_x64/Code/easyspider_executestage.py Vedi File

@ -6,8 +6,8 @@ import platform
import shutil
import string
import undetected_chromedriver as uc
from utils import detect_optimizable, download_image, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, write_to_csv, write_to_excel, write_to_json
from utils import detect_optimizable, download_image, extract_text_from_html, get_output_code, isnotnull, lowercase_tags_in_xpath, myMySQL, new_line, \
on_press_creator, on_release_creator, readCode, replace_field_values, send_email, split_text_by_lines, write_to_csv, write_to_excel, write_to_json
from myChrome import MyChrome
from threading import Thread, Event
from PIL import Image
@ -47,10 +47,11 @@ import requests
from ddddocr import DdddOcr
from urllib.parse import urljoin
from lxml import etree, html
import onnxruntime
onnxruntime.set_default_logger_severity(3) # 隐藏onnxruntime的日志
# import pandas as pd
import pandas as pd
# import numpy
# import pytesseract
# import uuid
@ -295,9 +296,13 @@ class BrowserThread(Thread):
except:
pass
try:
node["parameters"]["recordASField"] += param["recordASField"]
node["parameters"]["recordASField"] = param["recordASField"]
except:
node["parameters"]["recordASField"] = 1
try:
splitLine = int(param["splitLine"])
except:
node["parameters"]["recordASField"] += 1
param["splitLine"] = 0
if param["contentType"] == 8:
self.print_and_log(
"默认的ddddocr识别功能如果觉得不好用,可以自行修改源码get_content函数->contentType == 8的位置换成自己想要的OCR模型然后自己编译运行;或者可以先设置采集内容类型为“元素截图”把图片保存下来,然后用自定义操作调用自己写的程序,程序的功能是读取这个最新生成的图片,然后用好用的模型,如PaddleOCR把图片识别出来,然后把返回值返回给程序作为参数输出。")
@ -333,6 +338,10 @@ class BrowserThread(Thread):
except:
node["parameters"]["exitElement"] = "//body"
node["parameters"]["quickExtractable"] = False # 是否可以快速提取
try:
skipCount = node["parameters"]["skipCount"]
except:
node["parameters"]["skipCount"] = 0
# 如果(不)固定元素列表循环中只有一个提取数据操作,且提取数据操作的提取内容为元素截图,那么可以快速提取
if len(node["sequence"]) == 1 and self.procedure[node["sequence"][0]]["option"] == 3 and (int(node["parameters"]["loopType"]) == 1 or int(node["parameters"]["loopType"]) == 2):
try:
@ -347,6 +356,8 @@ class BrowserThread(Thread):
node["parameters"]["quickExtractable"] = False # 如果是iframe,那么不可以快速提取
else:
node["parameters"]["quickExtractable"] = True # 先假设可以快速提取
if node["parameters"]["skipCount"] > 0:
node["parameters"]["quickExtractable"] = False # 如果有跳过的元素,那么不可以快速提取
for param in params:
optimizable = detect_optimizable(param, ignoreWaitElement=False, waitElement=waitElement)
try:
@ -463,21 +474,51 @@ class BrowserThread(Thread):
self.print_and_log(
"Already read input parameters from Excel and overwrite the original input parameters.")
def removeDuplicateData(self):
    """Drop duplicate rows from the task's saved output when the task asks for it.

    Runs only when ``self.service["removeDuplicate"]`` equals 1; if the
    setting cannot be read it is treated as 0 (disabled). For csv/txt/xlsx/json
    output the file ``Data/Task_<id>/<saveName>.<outputFormat>`` is loaded
    with pandas, de-duplicated and written back in place. For mysql output
    the work is delegated to ``self.mysql.remove_duplicate_data()``.

    Returns:
        None. Progress is reported through ``self.print_and_log``.
    """
    try:
        enabled = self.service["removeDuplicate"]
    except Exception:
        # Setting absent (or service not subscriptable): feature is off.
        enabled = 0
    if enabled != 1:
        return

    self.print_and_log("正在去除重复数据,请稍后……")
    self.print_and_log("Removing duplicate data, please wait...")
    output_format = self.outputFormat
    if output_format in ("csv", "txt", "json", "xlsx"):
        file_name = "Data/Task_" + \
            str(self.id) + "/" + self.saveName + \
            '.' + output_format
        if output_format in ("csv", "txt"):
            # Read every cell as literal text so the round trip cannot
            # rewrite values ("007" -> 7, "NA" -> empty, 1 -> 1.0).
            df = pd.read_csv(file_name, dtype=str, keep_default_na=False)
            df.drop_duplicates(inplace=True)
            df.to_csv(file_name, index=False)
        elif output_format == "xlsx":
            df = pd.read_excel(file_name)
            df.drop_duplicates(inplace=True)
            df.to_excel(file_name, index=False)
        else:  # json
            # Without these switches pandas coerces numeric-looking strings
            # and turns columns named like "date"/"*_time" into timestamps,
            # which to_json would then emit as epoch numbers.
            df = pd.read_json(file_name, dtype=False, convert_dates=False)
            df.drop_duplicates(inplace=True)
            df.to_json(file_name, orient="records", force_ascii=False)
    elif output_format == "mysql":
        self.mysql.remove_duplicate_data()
    self.print_and_log("去重完成。")
    self.print_and_log("Duplicate data removed.")
def run(self):
# 挨个执行程序
for i in range(len(self.links)):
self.print_and_log("正在执行第", i + 1, "/ ", len(self.links), "个链接")
self.print_and_log("正在执行第", i + 1, "/", len(self.links), "个链接")
self.print_and_log("Executing link", i + 1,
"/ ", len(self.links))
"/", len(self.links))
self.executeNode(0)
self.urlId = self.urlId + 1
files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
# 如果目录为空,则删除该目录
if not files:
os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
# if not files:
# os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
self.print_and_log("Done!")
self.print_and_log("执行完成!")
self.saveData(exit=True)
self.removeDuplicateData()
if self.outputFormat == "mysql":
self.mysql.close()
try:
@ -1115,10 +1156,18 @@ class BrowserThread(Thread):
if node["parameters"]["exitCount"] == 0:
# newBodyText = self.browser.find_element(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"]).text
# 用find_elements获取所有匹配到的文本
exitElements = self.browser.find_elements(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"])
newBodyText = ""
for exitElement in exitElements:
newBodyText += exitElement.text
try:
exitElements = self.browser.find_elements(By.XPATH, node["parameters"]["exitElement"], iframe=node["parameters"]["iframe"])
newBodyText = ""
for exitElement in exitElements:
newBodyText += exitElement.text
except Exception as e:
self.print_and_log(f"设定的退出循环元素:{node['parameters']['exitElement']}的文本无法获取,本次循环将不再检测元素文本是否变化,将会继续执行,为解决此问题,您可以修改检测元素文本不变的元素为其他元素,或者将循环次数设定为固定次数大于0的值。")
self.print_and_log(f"The text of the exit loop element set: {node['parameters']['exitElement']} cannot be obtained, this loop will no longer check whether the text of the element has changed, and will continue to execute. To solve this problem, you can modify the element whose text does not change to other elements, or set the number of loops to a fixed number greater than 0.")
self.print_and_log(e)
exitElements = []
# newBodyText为随机文本,保证一直执行
newBodyText = str(random.random())
if node["parameters"]["iframe"]: # 如果标记了iframe
iframes = self.browser.find_elements(
By.CSS_SELECTOR, "iframe", iframe=False)
@ -1200,9 +1249,15 @@ class BrowserThread(Thread):
if len(elements) == 0:
self.print_and_log("Loop element not found: ",
xpath)
self.print_and_log("找不到循环元素: ", xpath)
self.print_and_log("找不到循环元素", xpath)
index = 0
skipCount = node["parameters"]["skipCount"]
while index < len(elements):
if index < skipCount:
index += 1
self.print_and_log("跳过第" + str(index) + "个元素")
self.print_and_log("Skip the " + str(index) + "th element")
continue
try:
element = elements[index]
element_text = element.text
@ -1250,7 +1305,7 @@ class BrowserThread(Thread):
index = index + 1
except NoSuchElementException:
self.print_and_log("Loop element not found: ", xpath)
self.print_and_log("找不到循环元素: ", xpath)
self.print_and_log("找不到循环元素", xpath)
except Exception as e:
raise
elif int(node["parameters"]["loopType"]) == 2: # 固定元素列表
@ -1258,7 +1313,13 @@ class BrowserThread(Thread):
paths = node["parameters"]["pathList"].split("\n")
# for path in node["parameters"]["pathList"].split("\n"):
index = 0
skipCount = node["parameters"]["skipCount"]
while index < len(paths):
if index < skipCount:
index += 1
self.print_and_log("跳过第" + str(index) + "个元素")
self.print_and_log("Skip the " + str(index) + "th element")
continue
path = paths[index]
try:
path = replace_field_values(
@ -1295,7 +1356,7 @@ class BrowserThread(Thread):
index, element = self.handleHistory(node, path, thisHistoryURL, thisHistoryLength, index, element=element)
except NoSuchElementException:
self.print_and_log("Loop element not found: ", path)
self.print_and_log("找不到循环元素: ", path)
self.print_and_log("找不到循环元素", path)
index += 1
continue # 循环中找不到元素就略过操作
except Exception as e:
@ -1314,7 +1375,14 @@ class BrowserThread(Thread):
if len(textList) == 1: # 如果固定文本列表只有一行,现在就可以替换变量
textList = replace_field_values(
node["parameters"]["textList"], self.outputParameters, self).split("\n")
skipCount = node["parameters"]["skipCount"]
index = 0
for text in textList:
if index < skipCount:
index += 1
self.print_and_log("跳过第" + str(index) + "个文本")
self.print_and_log("Skip the " + str(index) + "th text")
continue
text = replace_field_values(text, self.outputParameters, self)
# self.recordLog("当前循环文本|Current loop text:", text)
for i in node["sequence"]: # 挨个执行操作
@ -1340,11 +1408,14 @@ class BrowserThread(Thread):
if len(urlList) == 1: # 如果固定网址列表只有一行,现在就可以替换变量
urlList = replace_field_values(
node["parameters"]["textList"], self.outputParameters, self).split("\n")
# urlList = []
# for url in tempList:
# if url != "":
# urlList.append(url)
skipCount = node["parameters"]["skipCount"]
index = 0
for url in urlList:
if index < skipCount:
index += 1
self.print_and_log("跳过第" + str(index) + "个网址")
self.print_and_log("Skip the " + str(index) + "th url")
continue
url = replace_field_values(url, self.outputParameters, self)
# self.recordLog("当前循环网址|Current loop url:", url)
for i in node["sequence"]:
@ -1392,7 +1463,7 @@ class BrowserThread(Thread):
self.history["handle"] = self.browser.current_window_handle
self.scrollDown(node["parameters"])
# 打开网页事件
# 打开网页操作
def openPage(self, param, loopValue):
time.sleep(1) # 打开网页后强行等待至少1秒
if len(self.browser.window_handles) > 1:
@ -1457,7 +1528,7 @@ class BrowserThread(Thread):
self.history["index"] = 0
self.scrollDown(param) # 控制屏幕向下滚动
# 键盘输入事件
# 键盘输入操作
def inputInfo(self, param, loopValue):
time.sleep(0.1) # 输入之前等待0.1秒
try:
@ -1509,7 +1580,7 @@ class BrowserThread(Thread):
xpath + ", please try to set the wait time before executing this operation")
self.print_and_log("找不到输入框元素:" + xpath + ",请尝试在执行此操作前设置等待时间")
# 点击元素事件
# 点击元素操作
def clickElement(self, param, loopElement=None, clickPath="", index=0):
try:
maxWaitTime = int(param["maxWaitTime"])
@ -1525,7 +1596,10 @@ class BrowserThread(Thread):
clickPath, self.outputParameters, self)
xpath = replace_field_values(
param["xpath"], self.outputParameters, self)
if param["useLoop"]: # 使用循环的情况下,传入的clickPath就是实际的xpath
if xpath.find("point(") >= 0: # 如果xpath中包含point(),说明是相对坐标的点击
index = 0
path = "//body"
elif param["useLoop"]: # 使用循环的情况下,传入的clickPath就是实际的xpath
if xpath == "":
path = clickPath
else:
@ -1557,9 +1631,21 @@ class BrowserThread(Thread):
try:
newTab = int(param["newTab"])
except:
newTab = 1
newTab = 0
try:
if click_way == 0: # 用selenium的点击方法
if xpath.find("point(") >= 0: # 如果xpath中包含point(),说明是相对坐标的点击
point = xpath.split("point(")[1].split(")")[0].split(",")
x = int(point[0])
y = int(point[1])
# try:
# actions = ActionChains(self.browser) # 实例化一个action对象
# actions.move_to_element(element).perform()
# actions.move_by_offset(x, y).perform()
# actions.click().perform()
# except Exception as e:
script = "document.elementFromPoint(" + str(x) + "," + str(y) + ").click();"
self.browser.execute_script(script)
elif click_way == 0: # 用selenium的点击方法
try:
actions = ActionChains(self.browser) # 实例化一个action对象
if newTab == 1: # 在新标签页打开
@ -1693,7 +1779,11 @@ class BrowserThread(Thread):
download_image(self, content, "Data/Task_" +
str(self.id) + "/" + self.saveName + "/", element)
else: # 普通节点
content = element.text
if p["splitLine"] == 1:
text = extract_text_from_html(element.get_attribute('outerHTML'))
content = split_text_by_lines(text)
else:
content = element.text
elif p["contentType"] == 1: # 只采集当期元素下的文本,不包括子元素
if p["nodeType"] == 2:
if element.get_attribute("href") != None:
@ -1830,7 +1920,7 @@ class BrowserThread(Thread):
self.outputParameters[key] = ""
self.recordLog("清空输出参数|Clear output parameters")
# 提取数据事件
# 提取数据操作
def getData(self, param, loopElement, isInLoop=True, parentPath="", index=0):
parentPath = replace_field_values(
parentPath, self.outputParameters, self)

+ 151
- 1
.temp_to_pub/EasySpider_windows_x64/Code/utils.py Vedi File

@ -7,8 +7,11 @@ import sys
import re
import time
import uuid
from bs4 import BeautifulSoup
# import keyboard
from openpyxl import Workbook, load_workbook
# import pandas as pd
# import xlsxwriter
import requests
from urllib.parse import urlparse
import pymysql
@ -69,6 +72,22 @@ def is_valid_url(url):
def lowercase_tags_in_xpath(xpath):
return re.sub(r"([A-Z]+)(?=[\[\]//]|$)", lambda x: x.group(0).lower(), xpath)
# 提取HTML中的文本内容
def extract_text_from_html(html_content):
    """Return the text content of an HTML string.

    The markup is parsed with BeautifulSoup using the lxml parser. ``script``
    and ``style`` elements are detached first so their bodies do not appear
    in the result, and a ``<br>`` plus a newline are appended inside every
    ``<p>`` so paragraphs end with a line break in the extracted text.
    """
    document = BeautifulSoup(html_content, 'lxml')
    # Script and style bodies are not page text; remove them from the tree.
    for node in document.find_all(["script", "style"]):
        node.extract()
    # Terminate each paragraph with a line break.
    for paragraph in document.find_all("p"):
        paragraph.append(document.new_tag("br"))
        paragraph.append("\n")
    return document.get_text()
# 将文本按照行分割并去除额外空白
def split_text_by_lines(text):
    """Normalize *text* to one trimmed, non-empty line per row.

    Each line is stripped of leading and trailing whitespace; lines that are
    empty after stripping are dropped. The remaining lines are joined with
    a single ``\\n``.
    """
    kept = []
    for raw_line in text.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            kept.append(trimmed)
    return "\n".join(kept)
def on_press_creator(press_time, event):
def on_press(key):
@ -137,7 +156,11 @@ def on_release_creator(event, press_time):
# time.sleep(1) # 每秒检查一次
def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1:
try:
splitLine = param["splitLine"]
except:
param["splitLine"] = 0
if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
if param["nodeType"] <= 2:
if ignoreWaitElement or waitElement == "":
return True
@ -336,11 +359,115 @@ def write_to_json(file_name, data, types, record, keys):
def write_to_excel(file_name, data, types, record):
# 首先,检查文件是否存在来决定是否处理第一行
# first = not os.path.exists(file_name)
# # 准备新数据
# new_data = pd.DataFrame(data)
# # 如果不是第一行(即文件已存在),对数据应用类型转换
# if not first:
# for i, col_type in enumerate(types):
# if col_type == "int" or col_type == "bigInt":
# try:
# new_data[i] = pd.to_numeric(new_data[i], errors='coerce').astype(int)
# except:
# new_data[i] = pd.to_numeric("0", errors='coerce').astype(int)
# elif col_type == "double":
# try:
# new_data[i] = pd.to_numeric(new_data[i], errors='coerce')(0.0)
# except:
# new_data[i] = pd.to_numeric("0.0", errors='coerce').astype(float)
# # 根据 record 筛选列
# new_data = new_data.loc[:, record]
# # 如果文件存在,则读取现有数据并追加新数据
# if first:
# combined_data = new_data
# else:
# # 使用 Pandas 读取现有数据
# existing_data = pd.read_excel(file_name)
# # 合并现有数据与新数据
# combined_data = pd.concat([existing_data, new_data], ignore_index=True)
# # 将合并后的数据写入 Excel
# combined_data.to_excel(file_name, index=False, engine='openpyxl')
# existing_data = []
# first = True
# # 检查文件是否存在
# if os.path.exists(file_name):
# # 使用 openpyxl 读取现有数据
# workbook = load_workbook(file_name, read_only=True)
# sheet = workbook.active
# # 读取已有行数
# num_rows = sheet.max_row
# if num_rows > 5000:
# print("Excel文件中的数据行数超过5000行,过多的行数将会导致追加模式写入数据速度变慢,建议更换为CSV文件或MySQL数据库存储数据。正在读取数据,请稍等...")
# print("The number of rows in the Excel file exceeds 5000, too many rows will cause the speed of writing data in append mode to slow down, it is recommended to replace it with CSV file or MySQL database to store data. Reading data, please wait...")
# # existing_data = [[sheet.cell(row=i, column=j).value for j in range(1, sheet.max_column + 1)] for i in range(1, sheet.max_row + 1)]
# for i in range(1, sheet.max_row + 1):
# row_data = []
# if num_rows > 5000 and i % 500 == 0:
# print(f"正在读取第{i}/{num_rows}行的数据...")
# print(f"Reading data of row {i}/{num_rows}...")
# for j in range(1, sheet.max_column + 1):
# cell = sheet.cell(row=i, column=j).value
# if cell is None:
# cell = ""
# row_data.append(cell)
# existing_data.append(row_data)
# first = False # 如果文件存在,首行不再是标题行
# # 使用 xlsxwriter 创建新文件
# workbook = xlsxwriter.Workbook(file_name)
# worksheet = workbook.add_worksheet()
# # 写入现有数据
# for row_num, row_data in enumerate(existing_data):
# for col_num, cell in enumerate(row_data):
# worksheet.write(row_num, col_num, cell)
# # 写入新数据
# row = len(existing_data)
# for line in data:
# to_write = []
# for i in range(len(line)):
# value = line[i]
# if not first: # 如果不是第一行,需要转换数据类型
# if types[i] == "int" or types[i] == "bigInt":
# try:
# value = int(value)
# except ValueError:
# value = 0
# elif types[i] == "double":
# try:
# value = float(value)
# except ValueError:
# value = 0.0
# if record[i]:
# to_write.append(value)
# first = False # 更新 first 以跳过数据类型转换
# for col, item in enumerate(to_write):
# worksheet.write(row, col, item)
# row += 1
# # 关闭工作簿
# workbook.close()
first = False
if os.path.exists(file_name):
# 加载现有的工作簿
wb = load_workbook(file_name)
# 行数读取
num_rows = wb.active.max_row
if num_rows > 1000:
print("Excel文件中的数据行数已超过1000行,过多的行数将会导致追加模式写入数据速度变慢,建议增大任务保存对话框中的“每采集多少条数据保存一次”选项的值以提升采集速度,或者更换为CSV文件或MySQL数据库存储数据。正在读取数据,请稍等...")
print("The number of rows in the Excel file already exceeds 1000, too many rows will cause the speed of writing data in append mode to slow down, it is recommended to increase the value of the 'Save every how many data' option in the task save dialog to improve the collection speed, or replace it with CSV file or MySQL database to store data. Reading data, please wait...")
ws = wb.active
if num_rows > 1000:
print("读取数据完成,正在追加数据...")
print("Reading data completed, appending data...")
else:
# 创建新的工作簿和工作表
wb = Workbook()
@ -433,6 +560,10 @@ class myMySQL:
sql = "CREATE TABLE " + table_name + \
" (_id INT AUTO_INCREMENT PRIMARY KEY, "
for item in parameters:
try:
recordASField = item["recordASField"]
except:
item["recordASField"] = True
if item["recordASField"]:
name = item['name']
if item['type'] == 'int':
@ -546,6 +677,25 @@ class myMySQL:
# 关闭游标和连接
self.cursor.close()
def remove_duplicate_data(self):
    """Remove duplicate rows from ``self.table_name``, keeping the lowest ``_id``.

    The distinct rows (grouped by the columns listed in ``self.field_sql``)
    are copied into a scratch table ``<table_name>_temp``, the original table
    is emptied and refilled from the scratch table, and the scratch table is
    dropped. A fresh cursor is opened on ``self.conn`` and stored in
    ``self.cursor`` for the duration of the call.

    Raises:
        Any exception raised by the cursor or connection is re-raised after
        ``self.conn.rollback()`` has been called; the cursor is closed in
        every case.
    """
    # field_sql is stored with surrounding parentheses, e.g. "(a, b)".
    fields = self.field_sql.replace("(", "").replace(")", "")
    temp_table = f"{self.table_name}_temp"
    self.cursor = self.conn.cursor()
    try:
        # A previous run that failed midway may have left the scratch table
        # behind; without this the CREATE below would fail on every retry.
        self.cursor.execute(f"DROP TABLE IF EXISTS {temp_table};")
        self.cursor.execute(
            f"CREATE TABLE {temp_table} AS "
            f"SELECT MIN(_id) AS _id, {fields}"
            f" FROM {self.table_name} GROUP BY {fields};")
        self.cursor.execute(f"DELETE FROM {self.table_name};")
        self.cursor.execute(
            f"INSERT INTO {self.table_name} SELECT * FROM {temp_table};")
        self.cursor.execute(f"DROP TABLE {temp_table};")
        # Commit the changes to the database.
        self.conn.commit()
    except Exception:
        # Undo an uncommitted DELETE so a failed refill does not leave the
        # table empty.
        self.conn.rollback()
        raise
    finally:
        self.cursor.close()
def close(self):
try:
self.conn.close()

+ 1
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/0.json
File diff soppresso perché troppo grande
Vedi File


+ 1
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/1.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/10.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/11.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/12.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/13.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/14.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/15.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/16.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/17.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/18.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/19.json Vedi File

@ -1 +0,0 @@
{"id":19,"name":"","url":"https://t.zsxq.com/15aUTk4Oa","links":"https://t.zsxq.com/15aUTk4Oa","create_time":"12/17/2023, 12:12:12 PM","update_time":"12/17/2023, 12:14:27 PM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","desc":"https://t.zsxq.com/15aUTk4Oa","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://t.zsxq.com/15aUTk4Oa","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://t.zsxq.com/15aUTk4Oa"}],"outputParameters":[{"id":0,"name":"执行JavaScript","desc":"自定义操作返回的数据","type":"text","recordASField":0,"exampleValue":""}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,3,4],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://t.zsxq.com/15aUTk4Oa","links":"https://t.zsxq.com/15aUTk4Oa","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":-1,"index":2,"parentId":0,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollC
ount":1,"scrollWaitTime":1,"clickWay":0,"newTab":0,"maxWaitTime":10,"params":[],"alertHandleType":0}},{"id":2,"index":3,"parentId":0,"type":0,"option":2,"title":"点击2023程序...","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":15,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"main-content-container\")]/app-topic[3]/div[1]/div[1]/div[1]/app-talk-content[1]/div[1]/app-file-gallery[1]/div[1]/div[2]/div[2]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/app-root[1]/app-index[1]/div[1]/app-topic-flow[1]/div[1]/app-main-content[1]/div[1]/app-topic[3]/div[1]/div[1]/div[1]/app-talk-content[1]/div[1]/app-file-gallery[1]/div[1]/div[2]/div[2]","//div[contains(., '2023程序员人群洞')]","//DIV[@class='file-name']","/html/body/app-root/app-index/div/app-topic-flow/div/app-main-content/div/app-topic[last()-17]/div/div/div[last()-1]/app-talk-content/div/app-file-gallery/div/div/div"]}},{"id":3,"index":4,"parentId":0,"type":0,"option":5,"title":"执行JavaScript","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"codeMode":0,"code":"document.elementFromPoint(20,20).click();","waitTime":0,"recordASField":0,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}}]}

+ 1
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/2.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/20.json Vedi File

@ -1 +0,0 @@
{"id":20,"name":"","url":"https://t.zsxq.com/15aUTk4Oa","links":"https://www.zsxq.com","create_time":"12/17/2023, 12:12:12 PM","update_time":"12/17/2023, 12:14:27 PM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","desc":"https://t.zsxq.com/15aUTk4Oa","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://t.zsxq.com/15aUTk4Oa","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://t.zsxq.com/15aUTk4Oa"}],"outputParameters":[{"id":0,"name":"执行JavaScript","desc":"自定义操作返回的数据","type":"text","recordASField":0,"exampleValue":""}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,3,4],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://t.zsxq.com/15aUTk4Oa","links":"https://www.zsxq.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":-1,"index":2,"parentId":0,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollW
aitTime":1,"clickWay":0,"newTab":0,"maxWaitTime":10,"params":[],"alertHandleType":0}},{"id":2,"index":3,"parentId":0,"type":0,"option":2,"title":"点击2023程序...","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":15,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"main-content-container\")]/app-topic[3]/div[1]/div[1]/div[1]/app-talk-content[1]/div[1]/app-file-gallery[1]/div[1]/div[2]/div[2]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/app-root[1]/app-index[1]/div[1]/app-topic-flow[1]/div[1]/app-main-content[1]/div[1]/app-topic[3]/div[1]/div[1]/div[1]/app-talk-content[1]/div[1]/app-file-gallery[1]/div[1]/div[2]/div[2]","//div[contains(., '2023程序员人群洞')]","//DIV[@class='file-name']","/html/body/app-root/app-index/div/app-topic-flow/div/app-main-content/div/app-topic[last()-17]/div/div/div[last()-1]/app-talk-content/div/app-file-gallery/div/div/div"]}},{"id":3,"index":4,"parentId":0,"type":0,"option":5,"title":"执行JavaScript","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"codeMode":0,"code":"document.elementFromPoint(20,20).click();","waitTime":0,"recordASField":0,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}}]}

+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/21.json Vedi File

@ -1 +0,0 @@
{"id":21,"name":"","url":"https://t.zsxq.com/15aUTk4Oa","links":"https://www.zsxq.com","create_time":"12/17/2023, 12:12:12 PM","update_time":"12/17/2023, 12:17:41 PM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","desc":"https://t.zsxq.com/15aUTk4Oa","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.zsxq.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.zsxq.com"}],"outputParameters":[{"id":0,"name":"执行JavaScript","desc":"自定义操作返回的数据","type":"text","recordASField":0,"exampleValue":""}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,3,4],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://t.zsxq.com/15aUTk4Oa","links":"https://www.zsxq.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":-1,"index":2,"parentId":0,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clic
kWay":0,"newTab":0,"maxWaitTime":10,"params":[],"alertHandleType":0}},{"id":2,"index":3,"parentId":0,"type":0,"option":2,"title":"点击2023程序...","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":15,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"main-content-container\")]/app-topic[3]/div[1]/div[1]/div[1]/app-talk-content[1]/div[1]/app-file-gallery[1]/div[1]/div[2]/div[2]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/app-root[1]/app-index[1]/div[1]/app-topic-flow[1]/div[1]/app-main-content[1]/div[1]/app-topic[3]/div[1]/div[1]/div[1]/app-talk-content[1]/div[1]/app-file-gallery[1]/div[1]/div[2]/div[2]","//div[contains(., '2023程序员人群洞')]","//DIV[@class='file-name']","/html/body/app-root/app-index/div/app-topic-flow/div/app-main-content/div/app-topic[last()-17]/div/div/div[last()-1]/app-talk-content/div/app-file-gallery/div/div/div"]}},{"id":3,"index":4,"parentId":0,"type":0,"option":5,"title":"执行JavaScript","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"codeMode":0,"code":"document.elementFromPoint(20,20).click();","waitTime":0,"recordASField":0,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}}]}

+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/22.json Vedi File

@ -1 +0,0 @@
{"id":22,"name":"","url":"https://t.zsxq.com/15aUTk4Oa","links":"https://wx.zsxq.com/dweb2","create_time":"12/17/2023, 12:12:12 PM","update_time":"12/17/2023, 12:18:23 PM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","desc":"https://t.zsxq.com/15aUTk4Oa","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://wx.zsxq.com/dweb2","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://wx.zsxq.com/dweb2"}],"outputParameters":[{"id":0,"name":"执行JavaScript","desc":"自定义操作返回的数据","type":"text","recordASField":0,"exampleValue":""}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,3,4],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://t.zsxq.com/15aUTk4Oa","links":"https://wx.zsxq.com/dweb2","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":-1,"index":2,"parentId":0,"type":0,"option":2,"title":"点击元素","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scr
ollWaitTime":1,"clickWay":0,"newTab":0,"maxWaitTime":10,"params":[],"alertHandleType":0}},{"id":2,"index":3,"parentId":0,"type":0,"option":2,"title":"点击2023程序...","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":15,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"main-content-container\")]/app-topic[3]/div[1]/div[1]/div[1]/app-talk-content[1]/div[1]/app-file-gallery[1]/div[1]/div[2]/div[2]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"newTab":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/app-root[1]/app-index[1]/div[1]/app-topic-flow[1]/div[1]/app-main-content[1]/div[1]/app-topic[3]/div[1]/div[1]/div[1]/app-talk-content[1]/div[1]/app-file-gallery[1]/div[1]/div[2]/div[2]","//div[contains(., '2023程序员人群洞')]","//DIV[@class='file-name']","/html/body/app-root/app-index/div/app-topic-flow/div/app-main-content/div/app-topic[last()-17]/div/div/div[last()-1]/app-talk-content/div/app-file-gallery/div/div/div"]}},{"id":3,"index":4,"parentId":0,"type":0,"option":5,"title":"执行JavaScript","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"codeMode":0,"code":"document.elementFromPoint(20,20).click();","waitTime":0,"recordASField":0,"paraType":"text","emailConfig":{"host":"","port":465,"username":"","password":"","from":"","to":"","subject":"","content":""}}}]}

+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/23.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/24.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/25.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/26.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/27.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/28.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/3.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/4.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/5.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/6.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/7.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/8.json
File diff soppresso perché troppo grande
Vedi File


+ 0
- 1
.temp_to_pub/EasySpider_windows_x64/execution_instances/9.json
File diff soppresso perché troppo grande
Vedi File


+ 1
- 1
.temp_to_pub/EasySpider_windows_x64/tasks/149.json Vedi File

@ -1 +1 @@
{"id":149,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/7/2023, 6:36:49 AM","version":"0.3.5","saveThreshold":10,"cloudflare":0,"environment":0,"maxViewLength":15,"outputFormat":"mysql","saveName":"京东","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., 
'手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"params":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"6h61epq3t9sljrq1vbg","iframe":false,"default":"","paraType":"text","beforeJS":"arguments[0].innerText = \"'\" + arguments[0].innerText + '\"'","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"6h61epq3t9sljrq1vbg","iframe":false,"default":"","paraType":"text","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}
{"id":149,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"7/7/2023, 6:36:49 AM","update_time":"12/20/2023, 4:03:13 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"mysql","saveName":"京东","dataWriteMode":1,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":1,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"List of URLs to be collected, separated by \\n for multiple lines","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","
breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"params":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"6h61epq3t9sljrq1vbg","iframe":false,"default":"","paraType":"text","beforeJS":"arguments[0].innerText = \"'\" + arguments[0].innerText + '\"'","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"6h61epq3t9sljrq1vbg","iframe":false,"default":"","paraType":"text","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}

+ 1
- 1
.temp_to_pub/EasySpider_windows_x64/tasks/213.json
File diff soppresso perché troppo grande
Vedi File


+ 1
- 1
.temp_to_pub/EasySpider_windows_x64/tasks/296.json
File diff soppresso perché troppo grande
Vedi File


+ 1
- 1
.temp_to_pub/EasySpider_windows_x64/tasks/8.json
File diff soppresso perché troppo grande
Vedi File


+ 1
- 0
.temp_to_pub/compress.py Vedi File

@ -31,6 +31,7 @@ def compress_folder_to_7z(folder_path, output_file):
# archive.writeall(folder_path, output_file)
# 压缩文件夹
try:
# "-mmt4"表示使用4个线程压缩
subprocess.call(["7z", "a", output_file, folder_path])
except:
subprocess.call(["7za", "a", output_file, folder_path])

BIN
ElectronJS/EasySpider_en.crx Vedi File


BIN
ElectronJS/EasySpider_zh.crx Vedi File


+ 1
- 0
ElectronJS/clean_and_release_win32.cmd Vedi File

@ -21,6 +21,7 @@ xcopy ..\ExecuteStage\undetected_chromedriver_ES ..\.temp_to_pub\EasySpider_wind
xcopy ..\ExecuteStage\.vscode ..\.temp_to_pub\EasySpider_windows_x32\Code\.vscode /E /I /Y
rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\user_data
rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\execution_instances
rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x32\TempUserDataFolder
mkdir ..\.temp_to_pub\EasySpider_windows_x32\execution_instances
rmdir /s /q ..\.temp_to_pub\EasySpider_windows_x32\Data
mkdir ..\.temp_to_pub\EasySpider_windows_x32\Data

+ 1
- 0
ElectronJS/clean_and_release_win64.cmd Vedi File

@ -21,6 +21,7 @@ xcopy ..\ExecuteStage\undetected_chromedriver_ES ..\.temp_to_pub\EasySpider_wind
xcopy ..\ExecuteStage\.vscode ..\.temp_to_pub\EasySpider_windows_x64\Code\.vscode /E /I /Y
rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\user_data
rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\execution_instances
rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\TempUserDataFolder
mkdir ..\.temp_to_pub\EasySpider_windows_x64\execution_instances
rmdir /s /Q ..\.temp_to_pub\EasySpider_windows_x64\Data
mkdir ..\.temp_to_pub\EasySpider_windows_x64\Data

+ 7
- 1
ElectronJS/main.js Vedi File

@ -950,13 +950,19 @@ async function runBrowser(lang = "en", user_data_folder = '', mobile = false) {
await cdpConnection.execute('Page.addScriptToEvaluateOnNewDocument', {
source: stealth,
});
if (config_context.user_data_folder == "") {
//调整浏览器窗口大小
let size = await driver.manage().window().getRect();
let width = size.width;
let height = size.height;
await driver.manage().window().setRect({width: width * 1.2, height: height});
}
try {
if (mobile) {
await driver.get(server_address + "/taskGrid/taskList.html?wsport=" + websocket_port + "&backEndAddressServiceWrapper=" + server_address + "&mobile=1&lang=" + lang);
} else {
await driver.get(server_address + "/taskGrid/taskList.html?wsport=" + websocket_port + "&backEndAddressServiceWrapper=" + server_address + "&lang=" + lang);
}
old_handles = await driver.getAllWindowHandles();
current_handle = old_handles[old_handles.length - 1];
} finally {

+ 1
- 0
ElectronJS/package_linux64.sh Vedi File

@ -14,6 +14,7 @@ rm -rf out/EasySpider/resources/app/.idea
rm -rf out/EasySpider/resources/app/tasks
rm -rf out/EasySpider/resources/app/execution_instances
rm -rf out/EasySpider/resources/app/user_data
rm -rf out/EasySpider/resources/app/TempUserDataFolder
rm -rf ../.temp_to_pub/EasySpider_Linux_x64/EasySpider
rm out/EasySpider/resources/app/vs_BuildTools.exe
mv out/EasySpider ../.temp_to_pub/EasySpider_Linux_x64/EasySpider

+ 1
- 0
ElectronJS/package_macos.sh Vedi File

@ -17,6 +17,7 @@ rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resource
rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/tasks
rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/execution_instances
rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/user_data
rm -r ../.temp_to_pub/EasySpider_MacOS_all_arch/EasySpider.app/Contents/Resources/app/TempUserDataFolder
rm -rf ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
mkdir ../.temp_to_pub/EasySpider_MacOS_all_arch/Code
cp ../ExecuteStage/easyspider_executestage.py ../.temp_to_pub/EasySpider_MacOS_all_arch/Code

+ 1
- 1
ElectronJS/src/taskGrid/FlowChart.html Vedi File

@ -651,7 +651,7 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
</div>
<div v-else-if='TClass == 7'>
<label>Code/Script Content (<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">Click here</a> for more examples): </label>
<textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="Enter the JS command for the current loop item. The loop item is represented by arguments[0]. If the return value is greater than 0 or true, the operations within this branch will be executed; otherwise, they will not be executed. For example: return arguments[0].innerText.length >= 5, which checks if the text length of the current loop item is greater than 5. Note that this is used in combination with element-related loop types (e.g., non-fixed element lists)."></textarea>
<textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="Enter the JS command for the current loop item. The loop item is represented by arguments[0]. If the return value is greater than 0 or true, the operations within this branch will be executed; otherwise, they will not be executed. For example: return arguments[0].innerText.length > 5, which checks if the text length of the current loop item is greater than 5. Note that this is used in combination with element-related loop types (e.g., non-fixed element lists)."></textarea>
<label>Maximum wait time for script execution (0 represents unlimited wait time): </label>
<input spellcheck=false onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
</div>

+ 1
- 1
ElectronJS/src/taskGrid/FlowChart_CN.html Vedi File

@ -651,7 +651,7 @@ print(emotlib.emoji()) # 使用其中的函数。
</div>
<div v-else-if='TClass == 7'>
<label>代码/脚本内容(<a href="https://github.com/NaiboWang/EasySpider/wiki/Example-of-JavaScript-instruction-for-the-current-iteration-in-a-conditional-statement" target="_blank">点击此处</a>查看更多示例): </label>
<textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令,该循环项用arguments[0]表示,返回值大于0或为真则执行此分支内操作,否则不执行。如:return arguments[0].innerText.length >=5 即判断当前循环项的文本长度是否大于5,注意要配合循环类型为元素相关(如不固定元素列表)使用。"></textarea>
<textarea spellcheck=false onkeydown="inputDelete(event)" class="form-control" rows="3" v-model='nowNode["parameters"]["code"]' placeholder="输入针对该循环项的JS命令,该循环项用arguments[0]表示,返回值大于0或为真则执行此分支内操作,否则不执行。如:return arguments[0].innerText.length >5 即判断当前循环项的文本长度是否大于5,注意要配合循环类型为元素相关(如不固定元素列表)使用。"></textarea>
<label>最长等待脚本执行时间(0代表无限等待): </label>
<input spellcheck=false onkeydown="inputDelete(event)" required class="form-control" type="number" v-model.number='nowNode["parameters"]["waitTime"]'></input>
</div>

+ 1
- 1
ElectronJS/src/taskGrid/executeTask.html Vedi File

@ -10,7 +10,7 @@
<script src="vue.js"></script>
<script src="bootstrap/js/bootstrap.js"></script>
<link href="bootstrap/css/bootstrap.css" rel="stylesheet"></link>
<title>任务执行 | Task Execute</title>
<title>任务执行 | Task Execution</title>
<style>
table {
table-layout: auto;

+ 1
- 1
ExecuteStage/.vscode/launch.json Vedi File

@ -12,7 +12,7 @@
"justMyCode": false,
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--ids", "[40]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"args": ["--ids", "[14]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"--read_type", "remote"]
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
}

+ 2
- 2
ExecuteStage/easyspider_executestage.py Vedi File

@ -513,8 +513,8 @@ class BrowserThread(Thread):
self.urlId = self.urlId + 1
files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
# 如果目录为空,则删除该目录
if not files:
os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
# if not files:
# os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
self.print_and_log("Done!")
self.print_and_log("执行完成!")
self.saveData(exit=True)

+ 4
- 0
ExecuteStage/utils.py Vedi File

@ -156,6 +156,10 @@ def on_release_creator(event, press_time):
# time.sleep(1) # 每秒检查一次
def detect_optimizable(param, ignoreWaitElement=True, waitElement=""):
try:
splitLine = param["splitLine"]
except:
param["splitLine"] = 0
if param["beforeJS"] == "" and param["afterJS"] == "" and param["contentType"] <= 1 and param["splitLine"] == 0:
if param["nodeType"] <= 2:
if ignoreWaitElement or waitElement == "":

Caricamento…
Annulla
Salva