Browse Source

Linux OCR Hint

pull/73/head
Naibo Wang 1 year ago
parent
commit
5853d2da00
4 changed files with 12 additions and 3 deletions
  1. ElectronJS/tasks/83.json (+1, -0)
  2. ExecuteStage/.gitignore (+1, -0)
  3. ExecuteStage/.vscode/launch.json (+2, -2)
  4. ExecuteStage/easyspider_executestage.py (+8, -1)

+ 1
- 0
ElectronJS/tasks/83.json View File

@@ -0,0 +1 @@
{"id":83,"name":"京东全球版-专业的综合网上购物商城","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"5/27/2023, 7:33:15 PM","version":"0.3.1","containJudge":false,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"string","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_文本","desc":"","type":"string","exampleValue":"/手机/数码"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","wait":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","textList":"","code":"","waitTime":0,"exitCount":0,"historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]","//div[contains(., '/手机/数码')]","//DIV[@class='LeftSide_menu_item__SBMWC LeftSide_text_space__2UhbG 
']"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","wait":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"paras":[{"nodeType":0,"contentType":8,"relative":true,"name":"参数1_文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"/手机/数码"},{"num":1,"value":"/家用电器"},{"num":2,"value":"/电脑/办公"},{"num":3,"value":"/家纺/家居/厨具"},{"num":4,"value":"/家具/家装/灯具/工业品"},{"num":5,"value":"/内衣/男装/女装/童装"},{"num":6,"value":"/箱包/钟表/珠宝/女鞋"},{"num":7,"value":"/运动/户外/男鞋"},{"num":8,"value":"/汽车用品/车载电器"},{"num":9,"value":"/母婴/洗护喂养"},{"num":10,"value":"/玩具乐器/宠物生活"},{"num":11,"value":"/家庭清洁/个人护理/计生情趣"},{"num":12,"value":"/图书/童书/文学"}],"default":"","beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0}],"loopType":1}}]}

+ 1
- 0
ExecuteStage/.gitignore View File

@@ -1,3 +1,4 @@
EasySpider/
node_modules/ node_modules/
*.csv *.csv
.idea/ .idea/

+ 2
- 2
ExecuteStage/.vscode/launch.json View File

@@ -2,7 +2,7 @@
// Use IntelliSense to learn about possible attributes. // Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes. // Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"version": "0.3.1",
"configurations": [ "configurations": [
{ {
"name": "Python: EasySpider", "name": "Python: EasySpider",
@@ -12,7 +12,7 @@
"console": "integratedTerminal", "console": "integratedTerminal",
"justMyCode": true, "justMyCode": true,
// "args": ["--id", "38", "--read_type", "local", "--headless", "1"] // "args": ["--id", "38", "--read_type", "local", "--headless", "1"]
"args": ["--id", "[21, 22]", "--headless", "0", "--user_data", "1"]
"args": ["--id", "[5]", "--headless", "0", "--user_data", "0"]
} }
] ]
} }

+ 8
- 1
ExecuteStage/easyspider_executestage.py View File

@@ -867,7 +867,14 @@ class BrowserThread(Thread):
except Exception as e: except Exception as e:
content = "OCR Error" content = "OCR Error"
print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html") print("To use OCR, You need to install Tesseract-OCR and add it to the environment variable PATH (need to restart EasySpider after you put in PATH): https://tesseract-ocr.github.io/tessdoc/Installation.html")
print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
if sys.platform == "win32":
print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
elif sys.platform == "darwin":
print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/146044810")
elif sys.platform == "linux":
print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://zhuanlan.zhihu.com/p/420259031")
else:
print("要使用OCR识别功能,你需要安装Tesseract-OCR并将其添加到环境变量PATH中(添加后需重启EasySpider):https://blog.csdn.net/u010454030/article/details/80515501\nhttps://www.bilibili.com/video/BV1xz4y1b72D/")
elif p["contentType"] == 9: elif p["contentType"] == 9:
content = self.execute_code(2, p["JS"], p["JSWaitTime"], element) content = self.execute_code(2, p["JS"], p["JSWaitTime"], element)
elif p["contentType"] == 10: # 下拉框选中的值 elif p["contentType"] == 10: # 下拉框选中的值

Loading…
Cancel
Save