@ -2,6 +2,7 @@
# import atexit
import atexit
import copy
import platform
import shutil
import string
import undetected_chromedriver as uc
@ -1711,6 +1712,7 @@ class BrowserThread(Thread):
p [ " relativeXPath " ] , self . outputParameters , self )
# Fast data extraction is only possible when the current iframe environment is unchanged
if self . browser . iframe_env != p [ " iframe " ] :
# if p["iframe"] or self.browser.iframe_env != p["iframe"]: # 如果是iframe,则不能快速提取数据,主要是各个上下文的iframe切换,但一般不会有人这么做
p [ " optimizable " ] = False
continue
# relativeXPath = relativeXPath.lower()
@ -1820,7 +1822,7 @@ class BrowserThread(Thread):
element = self . browser . find_element (
By . XPATH , relativeXPath , iframe = p [ " iframe " ] )
except (
NoSuchElementException , InvalidSelectorException , StaleElementReferenceException ) : # 找不到元素的时候,使用默认值
NoSuchElementException , InvalidSelectorException , StaleElementReferenceException ) as e : # 找不到元素的时候,使用默认值
# self.print_and_log(p)
try :
content = p [ " default " ]
@ -1835,6 +1837,7 @@ class BrowserThread(Thread):
self . print_and_log (
" 提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错 " % (
p [ " name " ] , relativeXPath ) )
self . dataNotFoundKeys [ p [ " name " ] ] = True
except :
pass
continue
@ -1916,92 +1919,57 @@ if __name__ == '__main__':
print ( c )
options = webdriver . ChromeOptions ( )
driver_path = " chromedriver.exe "
import platform
print ( sys . platform , platform . architecture ( ) )
# option = webdriver.ChromeOptions()
if not os . path . exists ( os . getcwd ( ) + " /Data " ) :
os . mkdir ( os . getcwd ( ) + " /Data " )
if sys . platform == " darwin " and platform . architecture ( ) [ 0 ] == " 64bit " :
options . binary_location = " EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome "
# MacOS需要用option而不是options!
# option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
# option.add_extension(
# "EasySpider.app/Contents/Resources/app/XPathHelper.crx")
options . add_extension (
" EasySpider.app/Contents/Resources/app/XPathHelper.crx " )
driver_path = " EasySpider.app/Contents/Resources/app/chromedriver_mac64 "
# options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
# # MacOS需要用option而不是options!
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
# driver_path = os.getcwd()+ "/chromedriver_mac64"
print ( driver_path )
if c . config_folder == " " :
c . config_folder = os . path . expanduser (
" ~/Library/Application Support/EasySpider/ " )
# print("Config folder for MacOS:", c.config_folder)
elif os . path . exists ( os . getcwd ( ) + " /EasySpider/resources " ) : # 打包后的路径
print ( " Finding chromedriver in EasySpider " ,
os . getcwd ( ) + " /EasySpider " )
if sys . platform == " win32 " and platform . architecture ( ) [ 0 ] == " 32bit " :
options . binary_location = os . path . join (
os . getcwd ( ) , " EasySpider/resources/app/chrome_win32/chrome.exe " ) # 指定chrome位置
# option.binary_location = os.path.join(
# os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
driver_path = os . path . join (
os . getcwd ( ) , " EasySpider/resources/app/chrome_win32/chromedriver_win32.exe " )
# option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options . add_extension ( " EasySpider/resources/app/XPathHelper.crx " )
elif sys . platform == " win32 " and platform . architecture ( ) [ 0 ] == " 64bit " :
options . binary_location = os . path . join (
os . getcwd ( ) , " EasySpider/resources/app/chrome_win64/chrome.exe " )
# option.binary_location = os.path.join(
# os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
driver_path = os . path . join (
os . getcwd ( ) , " EasySpider/resources/app/chrome_win64/chromedriver_win64.exe " )
# option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options . add_extension ( " EasySpider/resources/app/XPathHelper.crx " )
elif sys . platform == " linux " and platform . architecture ( ) [ 0 ] == " 64bit " :
options . binary_location = " EasySpider/resources/app/chrome_linux64/chrome "
# option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
driver_path = " EasySpider/resources/app/chrome_linux64/chromedriver_linux64 "
# option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options . add_extension ( " EasySpider/resources/app/XPathHelper.crx " )
else :
print ( " Unsupported platform " )
sys . exit ( )
print ( " Chrome location: " , options . binary_location )
print ( " Chromedriver location: " , driver_path )
# elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
# print("Finding chromedriver in ./Chrome",
# os.getcwd()+"/Chrome")
# options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
# # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
# driver_path = "./Chrome/chromedriver.exe"
elif os . path . exists ( os . getcwd ( ) + " /../ElectronJS " ) :
# 软件dev用
print ( " Finding chromedriver in EasySpider " ,
os . getcwd ( ) + " /ElectronJS " )
# option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
options . binary_location = " ../ElectronJS/chrome_win64/chrome.exe " # 指定chrome位置
driver_path = " ../ElectronJS/chrome_win64/chromedriver_win64.exe "
# option.add_extension("../ElectronJS/XPathHelper.crx")
options . add_extension ( " ../ElectronJS/XPathHelper.crx " )
else :
options . binary_location = " ./chrome.exe " # 指定chrome位置
# option.binary_location = "./chrome.exe" # 指定chrome位置
driver_path = " ./chromedriver.exe "
# option.add_extension("XPathHelper.crx")
options . add_extension ( " XPathHelper.crx " )
# option.add_experimental_option(
# 'excludeSwitches', ['enable-automation']) # 以开发者模式
options . add_experimental_option (
' excludeSwitches ' , [ ' enable-automation ' ] ) # 以开发者模式
# user_data_dir = r'' # 注意没有Default!
# options.add_argument('--user-data-dir='+p)
# 总结:
# 0. 带Cookie需要用userdatadir
@ -2018,22 +1986,15 @@ if __name__ == '__main__':
except :
pass
# options.add_argument(
# '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
# option.add_argument(
# "--disable-blink-features=AutomationControlled") # TMALL 反扒
options . add_argument (
" --disable-blink-features=AutomationControlled " ) # TMALL 反扒
options . add_argument ( ' -ignore-certificate-errors ' )
options . add_argument ( ' -ignore -ssl-errors ' )
# option.add_argument('-ignore-certificate-errors')
# option.add_argument('-ignore -ssl-errors')
if c . headless :
print ( " Headless mode " )
print ( " 无头模式 " )
# option.add_argument("--headless")
options . add_argument ( " --headless " )
tmp_options = [ ]
@ -2058,11 +2019,7 @@ if __name__ == '__main__':
shutil . copytree ( absolute_user_data_folder , tmp_user_data_folder )
print ( " User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually. " )
print ( " 用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。 " )
# option = tmp_options[i]["option"]
options = tmp_options [ i ] [ " options " ]
# option.add_argument(
# f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
# option.add_argument("--profile-directory=Default")
options . add_argument (
f ' --user-data-dir={tmp_user_data_folder} ' ) # TMALL 反扒
options . add_argument ( " --profile-directory=Default " )
@ -2074,7 +2031,6 @@ if __name__ == '__main__':
threads = [ ]
for i in range ( len ( c . ids ) ) :
id = c . ids [ i ]
# option = tmp_options[i]["option"]
options = tmp_options [ i ] [ " options " ]
print ( " id: " , id )
if c . read_type == " remote " :
@ -2100,7 +2056,6 @@ if __name__ == '__main__':
cloudflare = 0
if cloudflare == 0 :
options . add_argument ( ' log-level=3 ' ) # 隐藏日志
# option.add_argument('log-level=3') # 隐藏日志
path = os . path . join ( os . path . abspath ( " ./ " ) , " Data " , " Task_ " + str ( id ) )
print ( " Data path: " , path )
options . add_experimental_option ( " prefs " , {
@ -2116,37 +2071,17 @@ if __name__ == '__main__':
' safebrowsing.disable_download_protection ' : True ,
' profile.default_content_settings.popups ' : 0 ,
} )
# option.add_experimental_option("prefs", {
# # 设置文件下载路径
# "download.default_directory": path,
# "download.prompt_for_download": False, # 禁止下载提示框
# "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
# "download.directory_upgrade": True,
# "download.extensions_to_open": "applications/pdf",
# "plugins.always_open_pdf_externally": True, # 总是在外部程序中打开PDF
# "safebrowsing_for_trusted_sources_enabled": False,
# "safebrowsing.enabled": False,
# 'safebrowsing.enabled': False,
# 'safebrowsing.disable_download_protection': True,
# 'profile.default_content_settings.popups': 0,
# })
try :
if service [ " environment " ] == 1 :
# option.add_experimental_option(
# 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
options . add_experimental_option (
' mobileEmulation ' , { ' deviceName ' : ' iPhone X ' } ) # 模拟iPhone X浏览
except :
pass
# browser_t = MyChrome(
# options=options, chrome_options=option, executable_path=driver_path)
selenium_service = Service ( executable_path = driver_path )
browser_t = MyChrome ( service = selenium_service , options = options )
elif cloudflare == 1 :
if sys . platform == " win32 " :
options . binary_location = " C: \\ Program Files \\ Google \\ Chrome Beta \\ Application \\ chrome.exe " # 需要用自己的浏览器
# options.add_argument("--auto-open-devtools-for-tabs")
# options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" # 需要用自己的浏览器
browser_t = MyUCChrome (
options = options , driver_executable_path = driver_path )
links = list ( filter ( isnotnull , service [ " links " ] . split ( " \n " ) ) )
@ -2200,8 +2135,6 @@ if __name__ == '__main__':
# print("您的操作系统不支持暂停功能。")
# print("Your operating system does not support the pause function.")
# print("线程长度:", len(threads) )
for thread in threads :
print ( )
thread . join ( )