Browse Source

New Download Location

pull/281/head
naibo 8 months ago
parent
commit
4f858ffee1
9 changed files with 68 additions and 13 deletions
  1. +3
    -0
      ElectronJS/src/taskGrid/FlowChart.html
  2. +3
    -0
      ElectronJS/src/taskGrid/FlowChart_CN.html
  3. +1
    -0
      ElectronJS/src/taskGrid/logic.js
  4. +1
    -1
      ElectronJS/src/taskGrid/taskList.html
  5. +1
    -1
      ElectronJS/tasks/228.json
  6. +1
    -0
      ElectronJS/tasks/311.json
  7. +1
    -1
      ExecuteStage/.vscode/launch.json
  8. +23
    -10
      ExecuteStage/easyspider_executestage.py
  9. +34
    -0
      ExecuteStage/utils.py

+ 3
- 0
ElectronJS/src/taskGrid/FlowChart.html View File

@ -191,6 +191,8 @@
<input spellcheck=false onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['scrollCount']" type="number" required></input>
<label>Wait time after scrolling (in seconds):</label>
<input spellcheck=false onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['scrollWaitTime']" type="number" required></input>
<label>Maximum file download wait time (in seconds):</label>
<input spellcheck="false" onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['downloadWaitTime']" type="number" required></input>
<label>Way to handle pop-up windows after clicking:</label>
<p><select v-model='nowNode["parameters"]["alertHandleType"]' class="form-control">
<option :value = 0>No pop-up window</option>
@ -718,6 +720,7 @@ If the expression returns a value greater than 0 or evaluates to True, the opera
<select id="dataWriteMode" name="dataWriteMode" class="form-control">
<option value="1">Append (If the file exists, append to it)</option>
<option value="2">Overwrite (If the file exists, overwrite it)</option>
<option value=3>Rename on Write (renames file if it already exists)</option>
</select>
<!-- <label>Is it an extreme anti-scraping website like Cloudflare (<a href="https://www.bilibili.com/video/BV1Ph4y1E7R9/" target="_blank">Watch Tutorial</a>)?</label>-->
<!-- <select id="cloudflare" name="cloudflare" class="form-control">-->

+ 3
- 0
ElectronJS/src/taskGrid/FlowChart_CN.html View File

@ -191,6 +191,8 @@
<input spellcheck=false onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['scrollCount']" type="number" required></input>
<label>滚动后等待时间(秒):</label>
<input spellcheck=false onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['scrollWaitTime']" type="number" required></input>
<label>文件下载最长等待时间(秒):</label>
<input spellcheck=false onkeydown="inputDelete(event)" class="form-control" v-model.number="nowNode['parameters']['downloadWaitTime']" type="number" required></input>
<label>点击元素后如有弹窗出现,弹窗处理方式:</label>
<p><select v-model='nowNode["parameters"]["alertHandleType"]' class="form-control">
<option :value = 0>无弹窗</option>
@ -718,6 +720,7 @@ print(emotlib.emoji()) # 使用其中的函数。
<select id="dataWriteMode" name="dataWriteMode" class="form-control">
<option value=1>追加写入(如果文件已存在则在原文件后面追加)</option>
<option value=2>覆盖写入(如果文件已存在则覆盖原文件)</option>
<option value=3>重命名写入(如果文件已存在则重命名文件)</option>
</select>
<!-- <label>是否为Cloudflare等极端反爬网站(<a href="https://www.bilibili.com/video/BV1Ph4y1E7R9/" target="_blank">查看Cloudflare设计和执行教程</a>):</label>-->
<!-- <select id="cloudflare" name="cloudflare" class="form-control">-->

+ 1
- 0
ElectronJS/src/taskGrid/logic.js View File

@ -282,6 +282,7 @@ function addParameters(t) {
t["parameters"]["afterJS"] = ""; //执行后执行的js
t["parameters"]["afterJSWaitTime"] = 0; //执行后js等待时间
t["parameters"]["alertHandleType"] = 0; //弹窗处理类型,1代表确认,2代表取消
t["parameters"]["downloadWaitTime"] = 3600; //下载等待时间
} else if (t.option == 3) { //提取数据
t["parameters"]["clear"] = 0; //清空其他字段数据
t["parameters"]["newLine"] = 1; //生成新行

+ 1
- 1
ElectronJS/src/taskGrid/taskList.html View File

@ -63,7 +63,7 @@
<el-table
style="width: 100%"
:empty-text="LANG('No Task~暂无任务')"
:data="list.filter(data => !search || (data.name.toLowerCase().includes(search.toLowerCase())) || (data.url.toLowerCase().includes(search.toLowerCase())) || (data.links.includes(search.toLowerCase())) || (data.desc.includes(search.toLowerCase())))"
:data="list.filter(data => !search || (data.name.toLowerCase().includes(search.toLowerCase())) || (data.url.toLowerCase().includes(search.toLowerCase())) || (data.links.includes(search.toLowerCase())) || (data.desc.includes(search.toLowerCase())) || (data.id.toString().includes(search.toLowerCase())))"
:default-sort="{prop: 'mtime', order: 'descending'}"
>

+ 1
- 1
ElectronJS/tasks/228.json View File

@ -1 +1 @@
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"12/7/2023, 2:56:47 AM","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"current_time","inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}}]}
{"id":228,"name":"[2312.02977] Exploring the nonclassical dynamics of the \"classical'' Schrödinger equation","url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","create_time":"12/7/2023, 2:44:58 AM","update_time":"2023-12-28 14:20:46","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":1,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://arxiv.org/abs/2312.02977","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://arxiv.org/abs/2312.02977","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://arxiv.org/abs/2312.02977"}],"outputParameters":[],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2,3],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://arxiv.org/abs/2312.02977","links":"https://arxiv.org/abs/2312.02977","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, 
\"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}},{"id":3,"index":3,"parentId":0,"type":0,"option":2,"title":"点击Download PDF","sequence":[],"isInLoop":false,"position":2,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"//*[contains(@class, \"download-pdf\")]","iframe":false,"wait":2,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"clickWay":0,"maxWaitTime":10,"params":[],"alertHandleType":0,"allXPaths":["/html/body/div[2]/main[1]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[1]/a[1]","//a[contains(., 'Download P')]","//A[@class='abs-button download-pdf']","/html/body/div[last()-3]/main/div/div/div[last()-2]/div[last()-5]/ul/li[last()-2]/a"]}}]}

+ 1
- 0
ElectronJS/tasks/311.json View File

@ -0,0 +1 @@
{"id":311,"name":"重命名测试","url":"https://www.jd.com","links":"https://www.jd.com","create_time":"2023-12-28 14:05:20","update_time":"2023-12-28 14:05:43","version":"0.6.0","saveThreshold":10,"quitWaitTime":60,"environment":0,"maximizeWindow":0,"maxViewLength":15,"recordLog":1,"outputFormat":"xlsx","saveName":"TTT","dataWriteMode":3,"inputExcel":"","startFromExit":0,"pauseKey":"p","containJudge":false,"browser":"chrome","removeDuplicate":0,"desc":"https://www.jd.com","inputParameters":[{"id":0,"name":"urlList_0","nodeId":1,"nodeName":"打开网页","value":"https://www.jd.com","desc":"要采集的网址列表,多行以\\n分开","type":"text","exampleValue":"https://www.jd.com"}],"outputParameters":[{"id":0,"name":"参数1_链接文本","desc":"","type":"text","recordASField":1,"exampleValue":"手机"},{"id":1,"name":"参数2_链接地址","desc":"","type":"text","recordASField":1,"exampleValue":"https://shouji.jd.com/"}],"graph":[{"index":0,"id":0,"parentId":0,"type":-1,"option":0,"title":"root","sequence":[1,2],"parameters":{"history":1,"tabIndex":0,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0},"isInLoop":false},{"id":1,"index":1,"parentId":0,"type":0,"option":1,"title":"打开网页","sequence":[],"isInLoop":false,"position":0,"parameters":{"useLoop":false,"xpath":"","wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"url":"https://www.jd.com","links":"https://www.jd.com","maxWaitTime":10,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"cookies":""}},{"id":2,"index":2,"parentId":0,"type":1,"option":8,"title":"循环采集数据","sequence":[3],"isInLoop":false,"position":1,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div/a","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afte
rJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"scrollType":0,"scrollCount":1,"scrollWaitTime":1,"loopType":1,"pathList":"","code":"","waitTime":0,"exitCount":0,"exitElement":"//body","historyWait":2,"breakMode":0,"breakCode":"","breakCodeWaitTime":0,"skipCount":0,"allXPaths":["/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/a[1]","//a[contains(., '手机')]","/html/body/div[last()-5]/div/div[last()-4]/div/div[last()-2]/div/div/div/div[last()-1]/div[last()-12]/a[last()-1]"]}},{"id":3,"index":3,"parentId":2,"type":0,"option":3,"title":"提取数据","sequence":[],"isInLoop":true,"position":0,"parameters":{"history":4,"tabIndex":-1,"useLoop":false,"xpath":"","iframe":false,"wait":0,"waitType":0,"beforeJS":"","beforeJSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"waitElement":"","waitElementTime":10,"waitElementIframeIndex":0,"clear":0,"newLine":1,"params":[{"nodeType":1,"contentType":0,"relative":true,"name":"参数1_链接文本","desc":"","extractType":0,"relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"手机"}],"unique_index":"zvn77ulso2lqoswqo4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0},{"nodeType":2,"contentType":0,"relative":true,"name":"参数2_链接地址","desc":"","relativeXPath":"","allXPaths":"","exampleValues":[{"num":0,"value":"https://shouji.jd.com/"}],"unique_index":"zvn77ulso2lqoswqo4","iframe":false,"default":"","paraType":"text","recordASField":1,"beforeJS":"","beforeJSWaitTime":0,"JS":"","JSWaitTime":0,"afterJS":"","afterJSWaitTime":0,"downloadPic":0,"splitLine":0}]}}]}

+ 1
- 1
ExecuteStage/.vscode/launch.json View File

@ -12,7 +12,7 @@
"justMyCode": false,
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
"args": ["--ids", "[63]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"args": ["--ids", "[67]", "--headless", "0", "--user_data", "0", "--keyboard", "0",
"--read_type", "remote"]
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
}

+ 23
- 10
ExecuteStage/easyspider_executestage.py View File

@ -112,9 +112,10 @@ class BrowserThread(Thread):
self.print_and_log("Save Name for task ID", id, "is:", self.saveName)
if not os.path.exists("Data/Task_" + str(id)):
os.mkdir("Data/Task_" + str(id))
if not os.path.exists("Data/Task_" + str(id) + "/" + self.saveName):
os.mkdir("Data/Task_" + str(id) + "/" +
self.saveName) # 创建保存文件夹用来保存截图
self.downloadFolder = "Data/Task_" + str(id) + "/" + self.saveName
if not os.path.exists(self.downloadFolder):
os.mkdir(self.downloadFolder) # 创建保存文件夹用来保存截图和文件
self.existing_files = sorted([os.path.join(self.downloadFolder, file) for file in os.listdir(self.downloadFolder)], key=os.path.getmtime)
self.getDataStep = 0
self.startSteps = 0
try:
@ -144,7 +145,7 @@ class BrowserThread(Thread):
'source': js}) # TMALL 反扒
WebDriverWait(self.browser, 10)
self.browser.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(self.id))
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(self.id), self.saveName)
self.paramss = {'cmd': 'Page.setDownloadBehavior', 'params': {'behavior': 'allow', 'downloadPath': path}}
self.browser.execute("send_command", self.paramss) # 下载地址改变
@ -187,12 +188,19 @@ class BrowserThread(Thread):
self.links = list(filter(isnotnull, service["url"])) # 要执行的link
self.OUTPUT = [] # 采集的数据
try:
self.dataWriteMode = service["dataWriteMode"] # 数据写入模式,1为追加,2为覆盖
self.dataWriteMode = service["dataWriteMode"] # 数据写入模式,1为追加,2为覆盖,3为重命名文件
except:
self.dataWriteMode = 1
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx" or self.outputFormat == "json":
if self.dataWriteMode == 2 and os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
if os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
if self.dataWriteMode == 2:
os.remove("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat)
elif self.dataWriteMode == 3:
i = 2
while os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '_' + str(i) + '.' + self.outputFormat):
i = i + 1
self.saveName = self.saveName + '_' + str(i)
self.print_and_log("文件已存在,已重命名为", self.saveName)
self.writeMode = 1 # 写入模式,0为新建,1为追加
if self.outputFormat == "csv" or self.outputFormat == "txt" or self.outputFormat == "xlsx":
if not os.path.exists("Data/Task_" + str(self.id) + "/" + self.saveName + '.' + self.outputFormat):
@ -521,7 +529,7 @@ class BrowserThread(Thread):
"/", len(self.links))
self.executeNode(0)
self.urlId = self.urlId + 1
files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
# files = os.listdir("Data/Task_" + str(self.id) + "/" + self.saveName)
# 如果目录为空,则删除该目录
# if not files:
# os.rmdir("Data/Task_" + str(self.id) + "/" + self.saveName)
@ -1799,6 +1807,11 @@ class BrowserThread(Thread):
self.print_and_log("History Length Error")
self.history["index"] = 0
self.scrollDown(param) # 根据参数配置向下滚动
# 处理文件变化,新下载
files = os.listdir(self.downloadFolder)
latest_file = files[-1]
self.existing_files = files
# rt.end()
def get_content(self, p, element):
@ -2372,8 +2385,8 @@ if __name__ == '__main__':
cloudflare = 0
if cloudflare == 0:
options.add_argument('log-level=3') # 隐藏日志
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id))
print("Data path:", path)
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id), "files")
print("文件下载路径|File Download path:", path)
options.add_experimental_option("prefs", {
# 设置文件下载路径
"download.default_directory": path,

+ 34
- 0
ExecuteStage/utils.py View File

@ -61,6 +61,40 @@ def send_email(config):
pass
def wait_for_download_complete(download_dir, timeout=3600, poll_interval=1):
    """Wait until ``download_dir`` holds no ``.crdownload`` files, or time out.

    A download is treated as "in progress" while at least one entry in
    ``download_dir`` has a name ending in ``.crdownload``.

    Args:
        download_dir: Directory to scan for partial-download files.
        timeout: Maximum time to wait, in seconds.
        poll_interval: Seconds to sleep between two scans of the directory.
            It is also the amount subtracted from the remaining time after
            each sleep. Must be positive.

    Returns:
        True if the last scan found no ``.crdownload`` file, False if the
        wait ended because the timeout was exhausted while one still existed.

    Raises:
        ValueError: If ``poll_interval`` is not positive (the countdown would
            never reach zero).
        OSError: Propagated from ``os.listdir`` when ``download_dir`` cannot
            be listed.
    """
    if poll_interval <= 0:
        raise ValueError("poll_interval must be positive")

    remaining = timeout
    # Progress is reported whenever the countdown reaches a multiple of 10.
    # Start with the largest multiple of 10 strictly below the timeout; this
    # also works for non-integer timeouts, where a plain `% 10 == 0` test
    # would never fire.
    next_report = -(-timeout // 10) * 10 - 10
    while True:
        # The sleep comes before the first scan as well.
        # NOTE(review): presumably this gives the browser time to create the
        # .crdownload file after the click - confirm before reordering.
        time.sleep(poll_interval)
        remaining -= poll_interval
        is_downloading = any(
            fname.endswith('.crdownload') for fname in os.listdir(download_dir)
        )
        # Stop when nothing is downloading any more or the time is used up.
        if not is_downloading or remaining <= 0:
            break
        if remaining <= next_report:
            print(f"下载文件中,请等待...|Downloading in progress, please wait... {remaining} seconds left")
            print("可以在点击元素选项中设置下载超时时间。|You can set the download timeout in the 'Click Element' option.")
            # Skip every mark already passed (matters when poll_interval > 10).
            while next_report >= remaining:
                next_report -= 10
    if is_downloading:
        print("下载可能未完成,但已经超时。|Download may not be completed, but it has timed out.")
    else:
        print("下载完成。|Download completed.")
    return not is_downloading
def rename_downloaded_file(download_dir):
    """Rename the most recently modified file in ``download_dir``.

    The newest regular file (by modification time) is assumed to be the file
    that was just downloaded. It is renamed to
    ``<name>_<uuid4>_<name>``; because the new name still ends with the
    original name, the file extension is preserved.

    Nothing is renamed when the directory contains no regular file or when
    the newest file is a partial download (name ending in ``.crdownload``).

    Args:
        download_dir: Directory that receives the downloaded files.

    Returns:
        The path of the renamed file, or None when nothing was renamed.
    """
    # Only regular files are candidates: a sub-directory with a newer mtime
    # must not be mistaken for the downloaded file.
    candidates = [
        path
        for path in (os.path.join(download_dir, name) for name in os.listdir(download_dir))
        if os.path.isfile(path)
    ]
    latest_file = max(candidates, key=os.path.getmtime, default=None)
    if latest_file is None or latest_file.endswith('.crdownload'):
        return None

    # os.path.basename honours the platform separator; splitting on '/'
    # leaves the directory part in the name on Windows.
    base_name = os.path.basename(latest_file)
    new_name = base_name + '_' + str(uuid.uuid4()) + '_' + base_name
    new_path = os.path.join(download_dir, new_name)
    os.rename(latest_file, new_path)
    print(f"文件已重命名为: {new_path}")
    print(f"File has been renamed to: {new_path}")
    return new_path
def is_valid_url(url):
try:
result = urlparse(url)

Loading…
Cancel
Save