Parcourir la source

更改提取顺序

pull/73/head
naibo il y a 1 an
Parent
révision
ba7bd000e4
5 fichiers modifiés avec 96 ajouts et 92 suppressions
  1. +1
    -0
      ElectronJS/tasks/84.json
  2. +1
    -0
      ElectronJS/tasks/85.json
  3. +92
    -92
      ExecuteStage/easyspider_executestage.py
  4. +1
    -0
      Releases/EasySpider_windows_amd64/execution_instances/0.json
  5. +1
    -0
      Releases/EasySpider_windows_amd64/tasks/24.json

+ 1
- 0
ElectronJS/tasks/84.json
Fichier diff supprimé car celui-ci est trop grand
Voir le fichier


+ 1
- 0
ElectronJS/tasks/85.json Voir le fichier

@ -0,0 +1 @@
{"id":85,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/28/2023, 1:41:36 AM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_图片页面标题","desc":"","type":"string","exampleValue":"京东全球版-专业的综合网上购物商城"},{"id":1,"name":"参数2_图片页面网址","desc":"","type":"string","exampleValue":"https://global.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":4,"contentType":6,"relative":false,"name":"参数1_图片页面标题","desc":"","extractType":0,"relativeXPath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/a[1]/img[1]","allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/div[1]/div[1]/a[1]/img[1]","//img[contains(., '')]"],"exampleValues":[{"num":0,"value":"京东全球版-专业的综合网上购物商城"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0},{"nodeType":4,"contentType":5,"relative":false,"name":"参数2_图片页面网址","desc":"","extractType":0,"relativeXPath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[4]/div[1]/div[1]/a[1]/img[1]","allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[4]/div[1]/div[1]/a[1]/img[1]","//img[contains(., '')]"],"exampleValues":[{"num":0,"value":"https://global.jd.com/"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

+ 92
- 92
ExecuteStage/easyspider_executestage.py Voir le fichier

@ -797,101 +797,101 @@ class BrowserThread(Thread):
def get_content(self, p, element):
    """Extract one output field from ``element`` as described by ``p``.

    ``p["contentType"]`` is dispatched on first; ``p["nodeType"]`` is only
    consulted when ``contentType`` is 0, so an explicit content type (for
    example "page title") wins even when the selected node is an image.

    Content types handled:

    * 0  -- node-type dependent: ``href`` attribute (nodeType 2), ``value``
      attribute (nodeType 3), ``src`` attribute (nodeType 4, additionally
      passed to ``download_image`` when ``p["downloadPic"] == 1``), or
      ``element.text`` for any other node type.
    * 1  -- text of the element's direct text-node children only.
    * 2 / 3 -- the ``innerHTML`` / ``outerHTML`` attribute.
    * 4  -- the ``background-image`` CSS value with the ``url("...")``
      wrapper stripped.
    * 5 / 6 -- the browser's current URL / title.
    * 7  -- resizes the window to the document body's scroll size and saves
      a screenshot of the element under ``Data/<saveName>/``; the returned
      content stays ``""``.
    * 8  -- OCR of the element's screenshot via ``pytesseract``;
      ``"OCR Error"`` plus installation hints on failure.
    * 9  -- result of ``self.execute_code(2, p["JS"], p["JSWaitTime"],
      element)``.
    * 10 / 11 -- value / text of the first selected ``<select>`` option,
      ``""`` when that lookup fails.

    Any other content type yields ``""``.

    Args:
        p: Mapping with at least ``"contentType"``; ``"nodeType"``,
            ``"downloadPic"``, ``"JS"`` and ``"JSWaitTime"`` are read by the
            branches listed above.
        element: Element object the value is read from (``get_attribute``,
            ``text``, ``screenshot`` ... are called on it).

    Returns:
        The extracted content, ``""`` when nothing was extracted.
    """
    import re  # local import: only the contentType == 1 branch needs it

    content = ""
    content_type = p["contentType"]

    if content_type == 0:
        # Special node types map to one attribute each; everything else is
        # an ordinary node whose visible text is collected.
        node_type = p["nodeType"]
        attribute_by_node_type = {2: "href", 3: "value", 4: "src"}
        attribute_name = attribute_by_node_type.get(node_type)
        if attribute_name is None:
            content = element.text
        else:
            value = element.get_attribute(attribute_name)
            content = value if value is not None else ""
            # Images may additionally be downloaded; a missing
            # "downloadPic" key means "do not download".
            if node_type == 4 and p.get("downloadPic", 0) == 1:
                download_image(content, "Data/" + self.saveName + "/")
    elif content_type == 1:
        # Only the text directly under this element, excluding the text of
        # child elements.
        command = (
            "var arr = [];"
            "var content = arguments[0];"
            "for (var i = 0, len = content.childNodes.length; i < len; i++) {"
            "if (content.childNodes[i].nodeType === 3) {"
            "arr.push(content.childNodes[i].nodeValue);"
            "}"
            "}"
            'var str = arr.join(" ");'
            "return str;"
        )
        text = self.browser.execute_script(command, element)
        # str.replace() does not understand regular expressions, so the
        # former .replace("\\s+", " ") never collapsed whitespace; use
        # re.sub for the intended effect.
        content = re.sub(r"\s+", " ", text.replace("\n", ""))
    elif content_type == 2:
        content = element.get_attribute('innerHTML')
    elif content_type == 3:
        content = element.get_attribute('outerHTML')
    elif content_type == 4:
        # Address of the element's background image.
        bg_url = element.value_of_css_property('background-image')
        # Strip the surrounding url("...") wrapper.
        bg_url = bg_url.replace('url("', '').replace('")', '')
        content = bg_url
    elif content_type == 5:
        content = self.browser.current_url
    elif content_type == 6:
        content = self.browser.title
    elif content_type == 7:
        # Height and width of the whole page.
        height = self.browser.execute_script("return document.body.scrollHeight")
        width = self.browser.execute_script("return document.body.scrollWidth")
        # Resize the browser window before taking the screenshot.
        self.browser.set_window_size(width, height)
        element.screenshot("Data/" + self.saveName + "/" + str(time.time()) + ".png")
    elif content_type == 8:
        try:
            screenshot = element.screenshot_as_png
            screenshot_stream = io.BytesIO(screenshot)
            # Open the screenshot with Pillow and convert it to grayscale.
            image = Image.open(screenshot_stream).convert('L')
            # Recognise the text in the image with the Tesseract OCR engine.
            content = pytesseract.image_to_string(image, lang='chi_sim+eng')
        except Exception as e:
            content = "OCR Error"
            print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html")
            if sys.platform == "win32":
                print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
            elif sys.platform == "darwin":
                print(e)
                print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/146044810")
            elif sys.platform == "linux":
                print(e)
                print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/420259031")
            else:
                print(e)
                print("注意以上错误,要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
    elif content_type == 9:
        content = self.execute_code(2, p["JS"], p["JSWaitTime"], element)
    elif content_type == 10:
        # Value of the option currently selected in a drop-down.
        try:
            select_element = Select(element)
            content = select_element.first_selected_option.get_attribute("value")
        except Exception:
            content = ""
    elif content_type == 11:
        # Text of the option currently selected in a drop-down.
        try:
            select_element = Select(element)
            content = select_element.first_selected_option.text
        except Exception:
            content = ""
    return content

+ 1
- 0
Releases/EasySpider_windows_amd64/execution_instances/0.json Voir le fichier

@ -0,0 +1 @@
{"id":0,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/28/2023, 1:46:16 AM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_图片页面标题","desc":"","type":"string","exampleValue":"京东全球版-专业的综合网上购物商城"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":4,"contentType":6,"relative":false,"name":"参数1_图片页面标题","desc":"","extractType":0,"relativeXPath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[1]/div[1]/a[1]/img[1]","allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[1]/div[1]/a[1]/img[1]","//img[contains(., '')]"],"exampleValues":[{"num":0,"value":"京东全球版-专业的综合网上购物商城"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

+ 1
- 0
Releases/EasySpider_windows_amd64/tasks/24.json Voir le fichier

@ -0,0 +1 @@
{"id":24,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/28/2023, 1:46:16 AM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_图片页面标题","desc":"","type":"string","exampleValue":"京东全球版-专业的综合网上购物商城"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":4,"contentType":6,"relative":false,"name":"参数1_图片页面标题","desc":"","extractType":0,"relativeXPath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[1]/div[1]/a[1]/img[1]","allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[2]/div[1]/div[1]/div[1]/div[2]/div[1]/div[3]/div[1]/div[1]/a[1]/img[1]","//img[contains(., '')]"],"exampleValues":[{"num":0,"value":"京东全球版-专业的综合网上购物商城"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}]}}]}

Chargement…
Annuler
Enregistrer