@ -5,7 +5,7 @@ import copy
import platform
import shutil
import string
import undetected_chromedriver as uc
# import undetected_chromedriver as uc
from utils import detect_optimizable , download_image , extract_text_from_html , get_output_code , isnotnull , lowercase_tags_in_xpath , myMySQL , new_line , \
on_press_creator , on_release_creator , readCode , replace_field_values , send_email , split_text_by_lines , write_to_csv , write_to_excel , write_to_json
from myChrome import MyChrome
@ -44,14 +44,24 @@ import sys
# import hashlib
import time
import requests
from ddddocr import DdddOcr
from multiprocessing import freeze_support
freeze_support ( ) # 防止无限死循环多开
try :
from ddddocr import DdddOcr
import onnxruntime
onnxruntime . set_default_logger_severity ( 3 ) # 隐藏onnxruntime的日志
except :
print ( " OCR识别无法在当前环境下使用(ddddocr库缺失),请使用完整版执行器easyspider_executestage_full来运行需要OCR识别的任务。 " )
print ( " OCR recognition cannot be used in the current environment (ddddocr library is missing), please use the executor with ddddocr ' easyspider_executestage_full ' to run the task which requires OCR recognition. " )
from urllib.parse import urljoin
from lxml import etree , html
try :
import pandas as pd
except :
print ( " 数据去重无法在当前环境下使用(pandas库缺失),请使用完整版执行器easyspider_executestage_full来运行需要去重的任务。 " )
print ( " Data deduplication cannot be used in the current environment (pandas library is missing), please use the executor with pandas ' easyspider_executestage_full ' to run the task which requires data deduplication. " )
time . sleep ( 1 )
import onnxruntime
onnxruntime . set_default_logger_severity ( 3 ) # 隐藏onnxruntime的日志
import pandas as pd
# import numpy
# import pytesseract
# import uuid
@ -651,26 +661,62 @@ class BrowserThread(Thread):
time . sleep ( param [ " scrollWaitTime " ] ) # 下拉完等待
except :
pass
except :
self . print_and_log ( ' Time out after set seconds when scrolling. ' )
except Exception as e :
self . print_and_log ( " 滚动屏幕时出错|Error scrolling screen: " , e )
try :
self . browser . execute_script ( ' window.stop() ' )
except :
pass
if scrollType != 0 and param [ " scrollCount " ] > 0 : # 控制屏幕向下滚动
for i in range ( param [ " scrollCount " ] ) :
self . print_and_log (
" Wait for set second after screen scrolling " )
body = self . browser . find_element (
By . CSS_SELECTOR , " body " , iframe = param [ " iframe " ] )
if scrollType == 1 :
body . send_keys ( Keys . PGDN )
elif scrollType == 2 :
if scrollType == 1 or scrollType == 2 :
for i in range ( param [ " scrollCount " ] ) :
body = self . browser . find_element (
By . CSS_SELECTOR , " body " , iframe = param [ " iframe " ] )
if scrollType == 1 :
body . send_keys ( Keys . PAGE_DOWN )
elif scrollType == 2 :
body . send_keys ( Keys . END )
try :
time . sleep ( param [ " scrollWaitTime " ] ) # 下拉完等待
except :
pass
self . print_and_log ( " 向下滚动,第 " , i + 1 , " 次。 " )
self . print_and_log (
" Scroll down, the " , i + 1 , " time. " )
elif scrollType == 3 :
bodyText = " "
i = 0
while True :
newBodyText = self . browser . find_element (
By . CSS_SELECTOR , " body " , iframe = False ) . text
if param [ " iframe " ] : # 如果标记了iframe
iframes = self . browser . find_elements (
By . CSS_SELECTOR , " iframe " , iframe = False )
for iframe in iframes :
self . browser . switch_to . default_content ( )
self . browser . switch_to . frame ( iframe )
iframe_text = super ( self . browser . __class__ , self . browser ) . find_element (
By . CSS_SELECTOR , " body " ) . text # 用super调用父类的方法
newBodyText + = iframe_text
self . browser . switch_to . default_content ( )
if newBodyText == bodyText :
self . print_and_log ( " 页面已检测不到新内容,停止滚动。 " )
self . print_and_log (
" No new content detected on the page, stop scrolling. " )
break
else :
bodyText = newBodyText
body = self . browser . find_element (
By . CSS_SELECTOR , " body " , iframe = param [ " iframe " ] )
body . send_keys ( Keys . END )
try :
time . sleep ( param [ " scrollWaitTime " ] ) # 下拉完等待
except :
pass
self . print_and_log ( " 滚动到底部,第 " , i + 1 , " 次。 " )
self . print_and_log (
" Scroll to the bottom, the " , i + 1 , " time. " )
i = i + 1
try :
time . sleep ( param [ " scrollWaitTime " ] ) # 下拉完等待
except :
pass
if rt != " " :
rt . end ( )
@ -1552,10 +1598,11 @@ class BrowserThread(Thread):
else :
value = param [ " value " ]
# 将value中的Field[""]替换为outputParameters中的键值
pattern = r ' Field \ [ " ([^ " ]+) " \ ] '
# pattern = r'Field\["([^"]+)"\] '
try :
replaced_text = re . sub (
pattern , lambda match : self . outputParameters . get ( match . group ( 1 ) , ' ' ) , value )
# replaced_text = re.sub(
# pattern, lambda match: self.outputParameters.get(match.group(1), ''), value)
replaced_text = replace_field_values ( value , self . outputParameters , self )
replaced_text = re . sub (
' <enter> ' , ' ' , replaced_text , flags = re . IGNORECASE )
except :
@ -2148,8 +2195,6 @@ class BrowserThread(Thread):
self . OUTPUT . append ( line )
if __name__ == ' __main__ ' :
from multiprocessing import freeze_support
freeze_support ( ) # 防止无限死循环多开
# 如果需要调试程序,请在命令行参数中加入--keyboard 0 来禁用键盘监听以提升调试速度
# If you need to debug the program, please add --keyboard 0 in the command line parameters to disable keyboard listening to improve debugging speed
config = {
@ -2303,7 +2348,13 @@ if __name__ == '__main__':
service = json . loads ( content . text ) # 加载服务信息
else :
print ( " local " )
with open ( " execution_instances/ " + str ( id ) + " .json " , ' r ' , encoding = ' utf-8 ' ) as f :
local_folder = os . path . join ( os . getcwd ( ) , " execution_instances " )
if sys . platform == " darwin " :
user_folder = os . path . expanduser (
" ~/Library/Application Support/EasySpider/ " )
local_folder = os . path . join ( user_folder , " execution_instances " )
file_path = os . path . join ( local_folder , str ( id ) + " .json " )
with open ( file_path , ' r ' , encoding = ' utf-8 ' ) as f :
content = f . read ( )
service = json . loads ( content ) # 加载服务信息
try :