Переглянути джерело

Iframe Nested

pull/254/head
naibo 9 місяці тому
джерело
коміт
c3773848c3
16 змінених файлів з 156 додано та 138 видалено
  1. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/145.json
  2. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/146.json
  3. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/147.json
  4. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/148.json
  5. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/execution_instances/149.json
  6. +1
    -0
      .temp_to_pub/EasySpider_windows_x64/tasks/238.json
  7. +1
    -0
      ElectronJS/tasks/230.json
  8. +1
    -0
      ElectronJS/tasks/231.json
  9. +1
    -0
      ElectronJS/tasks/232.json
  10. +1
    -0
      ElectronJS/tasks/233.json
  11. +2
    -2
      ExecuteStage/.vscode/launch.json
  12. +4
    -71
      ExecuteStage/easyspider_executestage.py
  13. +1
    -1
      ExecuteStage/generateEXE_win64.cmd
  14. +136
    -61
      ExecuteStage/myChrome.py
  15. +2
    -2
      Extension/manifest_v3/package-lock.json
  16. +1
    -1
      Extension/manifest_v3/src/content-scripts/config.json

+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/145.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/146.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/147.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/148.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/execution_instances/149.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 1
- 0
.temp_to_pub/EasySpider_windows_x64/tasks/238.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 1
- 0
ElectronJS/tasks/230.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 1
- 0
ElectronJS/tasks/231.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 1
- 0
ElectronJS/tasks/232.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 1
- 0
ElectronJS/tasks/233.json
Різницю між файлами не показано, бо вона завелика
Переглянути файл


+ 2
- 2
ExecuteStage/.vscode/launch.json Переглянути файл

@ -12,8 +12,8 @@
"justMyCode": false,
// "args": ["--ids", "[7]", "--read_type", "remote", "--headless", "0"]
// "args": ["--ids", "[9]", "--read_type", "remote", "--headless", "0", "--saved_file_name", "YOUTUBE"]
// "args": ["--ids", "[1]", "--headless", "0", "--user_data", "1", "--keyboard", "1"]
"args": "--ids '[3]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
"args": ["--ids", "[149]", "--headless", "0", "--user_data", "0", "--keyboard", "0"]
// "args": "--ids '[97]' --user_data 1 --server_address http://localhost:8074 --config_folder '/Users/naibo/Documents/EasySpider/ElectronJS/' --headless 0 --read_type remote --config_file_name config.json --saved_file_name"
}
]
}

+ 4
- 71
ExecuteStage/easyspider_executestage.py Переглянути файл

@ -2,6 +2,7 @@
# import atexit
import atexit
import copy
import platform
import shutil
import string
import undetected_chromedriver as uc
@ -1711,6 +1712,7 @@ class BrowserThread(Thread):
p["relativeXPath"], self.outputParameters, self)
# 只有当前环境不变变化才可以快速提取数据
if self.browser.iframe_env != p["iframe"]:
# if p["iframe"] or self.browser.iframe_env != p["iframe"]: # 如果是iframe,则不能快速提取数据,主要是各个上下文的iframe切换,但一般不会有人这么做
p["optimizable"] = False
continue
# relativeXPath = relativeXPath.lower()
@ -1820,7 +1822,7 @@ class BrowserThread(Thread):
element = self.browser.find_element(
By.XPATH, relativeXPath, iframe=p["iframe"])
except (
NoSuchElementException, InvalidSelectorException, StaleElementReferenceException): # 找不到元素的时候,使用默认值
NoSuchElementException, InvalidSelectorException, StaleElementReferenceException) as e: # 找不到元素的时候,使用默认值
# self.print_and_log(p)
try:
content = p["default"]
@ -1835,6 +1837,7 @@ class BrowserThread(Thread):
self.print_and_log(
"提取数据操作时,字段名 %s 对应XPath %s 未找到,使用默认值,本字段将不再重复报错" % (
p["name"], relativeXPath))
self.dataNotFoundKeys[p["name"]] = True
except:
pass
continue
@ -1916,92 +1919,57 @@ if __name__ == '__main__':
print(c)
options = webdriver.ChromeOptions()
driver_path = "chromedriver.exe"
import platform
print(sys.platform, platform.architecture())
# option = webdriver.ChromeOptions()
if not os.path.exists(os.getcwd() + "/Data"):
os.mkdir(os.getcwd() + "/Data")
if sys.platform == "darwin" and platform.architecture()[0] == "64bit":
options.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
# MacOS需要用option而不是options!
# option.binary_location = "EasySpider.app/Contents/Resources/app/chrome_mac64.app/Contents/MacOS/Google Chrome"
# option.add_extension(
# "EasySpider.app/Contents/Resources/app/XPathHelper.crx")
options.add_extension(
"EasySpider.app/Contents/Resources/app/XPathHelper.crx")
driver_path = "EasySpider.app/Contents/Resources/app/chromedriver_mac64"
# options.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
# # MacOS需要用option而不是options!
# option.binary_location = "chrome_mac64.app/Contents/MacOS/Google Chrome"
# driver_path = os.getcwd()+ "/chromedriver_mac64"
print(driver_path)
if c.config_folder == "":
c.config_folder = os.path.expanduser(
"~/Library/Application Support/EasySpider/")
# print("Config folder for MacOS:", c.config_folder)
elif os.path.exists(os.getcwd() + "/EasySpider/resources"): # 打包后的路径
print("Finding chromedriver in EasySpider",
os.getcwd() + "/EasySpider")
if sys.platform == "win32" and platform.architecture()[0] == "32bit":
options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
# option.binary_location = os.path.join(
# os.getcwd(), "EasySpider/resources/app/chrome_win32/chrome.exe") # 指定chrome位置
driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win32/chromedriver_win32.exe")
# option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
elif sys.platform == "win32" and platform.architecture()[0] == "64bit":
options.binary_location = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
# option.binary_location = os.path.join(
# os.getcwd(), "EasySpider/resources/app/chrome_win64/chrome.exe")
driver_path = os.path.join(
os.getcwd(), "EasySpider/resources/app/chrome_win64/chromedriver_win64.exe")
# option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
elif sys.platform == "linux" and platform.architecture()[0] == "64bit":
options.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
# option.binary_location = "EasySpider/resources/app/chrome_linux64/chrome"
driver_path = "EasySpider/resources/app/chrome_linux64/chromedriver_linux64"
# option.add_extension("EasySpider/resources/app/XPathHelper.crx")
options.add_extension("EasySpider/resources/app/XPathHelper.crx")
else:
print("Unsupported platform")
sys.exit()
print("Chrome location:", options.binary_location)
print("Chromedriver location:", driver_path)
# elif os.getcwd().find("ExecuteStage") >= 0: # 如果直接执行
# print("Finding chromedriver in ./Chrome",
# os.getcwd()+"/Chrome")
# options.binary_location = "./Chrome/chrome.exe" # 指定chrome位置
# # option.binary_location = "C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe"
# driver_path = "./Chrome/chromedriver.exe"
elif os.path.exists(os.getcwd() + "/../ElectronJS"):
# 软件dev用
print("Finding chromedriver in EasySpider",
os.getcwd() + "/ElectronJS")
# option.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
options.binary_location = "../ElectronJS/chrome_win64/chrome.exe" # 指定chrome位置
driver_path = "../ElectronJS/chrome_win64/chromedriver_win64.exe"
# option.add_extension("../ElectronJS/XPathHelper.crx")
options.add_extension("../ElectronJS/XPathHelper.crx")
else:
options.binary_location = "./chrome.exe" # 指定chrome位置
# option.binary_location = "./chrome.exe" # 指定chrome位置
driver_path = "./chromedriver.exe"
# option.add_extension("XPathHelper.crx")
options.add_extension("XPathHelper.crx")
# option.add_experimental_option(
# 'excludeSwitches', ['enable-automation']) # 以开发者模式
options.add_experimental_option(
'excludeSwitches', ['enable-automation']) # 以开发者模式
# user_data_dir = r'' # 注意没有Default!
# options.add_argument('--user-data-dir='+p)
# 总结:
# 0. 带Cookie需要用userdatadir
@ -2018,22 +1986,15 @@ if __name__ == '__main__':
except:
pass
# options.add_argument(
# '--user-data-dir=C:\\Users\\q9823\\AppData\\Local\\Google\\Chrome\\User Data') # TMALL 反扒
# option.add_argument(
# "--disable-blink-features=AutomationControlled") # TMALL 反扒
options.add_argument(
"--disable-blink-features=AutomationControlled") # TMALL 反扒
options.add_argument('-ignore-certificate-errors')
options.add_argument('-ignore -ssl-errors')
# option.add_argument('-ignore-certificate-errors')
# option.add_argument('-ignore -ssl-errors')
if c.headless:
print("Headless mode")
print("无头模式")
# option.add_argument("--headless")
options.add_argument("--headless")
tmp_options = []
@ -2058,11 +2019,7 @@ if __name__ == '__main__':
shutil.copytree(absolute_user_data_folder, tmp_user_data_folder)
print("User data folder copied successfully, if you exit the program before it finishes, please delete the temporary user data folder manually.")
print("用户信息目录复制成功,如果程序在运行过程中被手动退出,请手动删除临时用户信息目录。")
# option = tmp_options[i]["option"]
options = tmp_options[i]["options"]
# option.add_argument(
# f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
# option.add_argument("--profile-directory=Default")
options.add_argument(
f'--user-data-dir={tmp_user_data_folder}') # TMALL 反扒
options.add_argument("--profile-directory=Default")
@ -2074,7 +2031,6 @@ if __name__ == '__main__':
threads = []
for i in range(len(c.ids)):
id = c.ids[i]
# option = tmp_options[i]["option"]
options = tmp_options[i]["options"]
print("id: ", id)
if c.read_type == "remote":
@ -2100,7 +2056,6 @@ if __name__ == '__main__':
cloudflare = 0
if cloudflare == 0:
options.add_argument('log-level=3') # 隐藏日志
# option.add_argument('log-level=3') # 隐藏日志
path = os.path.join(os.path.abspath("./"), "Data", "Task_" + str(id))
print("Data path:", path)
options.add_experimental_option("prefs", {
@ -2116,37 +2071,17 @@ if __name__ == '__main__':
'safebrowsing.disable_download_protection': True,
'profile.default_content_settings.popups': 0,
})
# option.add_experimental_option("prefs", {
# # 设置文件下载路径
# "download.default_directory": path,
# "download.prompt_for_download": False, # 禁止下载提示框
# "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
# "download.directory_upgrade": True,
# "download.extensions_to_open": "applications/pdf",
# "plugins.always_open_pdf_externally": True, # 总是在外部程序中打开PDF
# "safebrowsing_for_trusted_sources_enabled": False,
# "safebrowsing.enabled": False,
# 'safebrowsing.enabled': False,
# 'safebrowsing.disable_download_protection': True,
# 'profile.default_content_settings.popups': 0,
# })
try:
if service["environment"] == 1:
# option.add_experimental_option(
# 'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
options.add_experimental_option(
'mobileEmulation', {'deviceName': 'iPhone X'}) # 模拟iPhone X浏览
except:
pass
# browser_t = MyChrome(
# options=options, chrome_options=option, executable_path=driver_path)
selenium_service = Service(executable_path=driver_path)
browser_t = MyChrome(service=selenium_service, options=options)
elif cloudflare == 1:
if sys.platform == "win32":
options.binary_location = "C:\\Program Files\\Google\\Chrome Beta\\Application\\chrome.exe" # 需要用自己的浏览器
# options.add_argument("--auto-open-devtools-for-tabs")
# options.binary_location = "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe" # 需要用自己的浏览器
browser_t = MyUCChrome(
options=options, driver_executable_path=driver_path)
links = list(filter(isnotnull, service["links"].split("\n")))
@ -2200,8 +2135,6 @@ if __name__ == '__main__':
# print("您的操作系统不支持暂停功能。")
# print("Your operating system does not support the pause function.")
# print("线程长度:", len(threads) )
for thread in threads:
print()
thread.join()

+ 1
- 1
ExecuteStage/generateEXE_win64.cmd Переглянути файл

@ -1,6 +1,6 @@
rmdir /s /q build
rmdir /s /q dist
@REM pyinstaller -F --icon=favicon.ico easyspider_executestage.py
pyinstaller -F --icon=favicon.ico --add-data "C:\Python311\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_shared.dll;onnxruntime\capi" --add-data "C:\Python311\Lib\site-packages\ddddocr\common.onnx;ddddocr" easyspider_executestage.py
pyinstaller -F --icon=favicon.ico --add-data "C:\Users\q9823\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\onnxruntime\capi\onnxruntime_providers_shared.dll;onnxruntime\capi" --add-data "C:\Users\q9823\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ddddocr\common.onnx;ddddocr" easyspider_executestage.py
del ..\ElectronJS\chrome_win64\easyspider_executestage.exe
copy dist\easyspider_executestage.exe ..\ElectronJS\chrome_win64\easyspider_executestage.exe

+ 136
- 61
ExecuteStage/myChrome.py Переглянути файл

@ -25,75 +25,150 @@ class MyChrome(webdriver.Chrome):
self.iframe_env = False # 现在的环境是root还是iframe
super().__init__(*args, **kwargs) # 调用父类的 __init__
def find_element(self, by=By.ID, value=None, iframe=False):
# 在这里改变查找元素的行为
if self.iframe_env:
super().switch_to.default_content()
self.iframe_env = False
if iframe:
# 获取所有的 iframe
# def find_element(self, by=By.ID, value=None, iframe=False):
# # 在这里改变查找元素的行为
# if self.iframe_env:
# super().switch_to.default_content()
# self.iframe_env = False
# if iframe:
# # 获取所有的 iframe
# try:
# iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
# except Exception as e:
# print(e)
# find_element = False
# # 遍历所有的 iframe 并查找里面的元素
# for iframe in iframes:
# # 切换到 iframe
# super().switch_to.default_content()
# super().switch_to.frame(iframe)
# self.iframe_env = True
# try:
# # 在 iframe 中查找元素
# # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
# element = super().find_element(by=by, value=value)
# find_element = True
# except NoSuchElementException as e:
# print(f"No such element found in the iframe: {str(e)}")
# except Exception as e:
# print(f"Exception: {str(e)}")
# # 完成操作后切回主文档
# # super().switch_to.default_content()
# if find_element:
# return element
# if not find_element:
# raise NoSuchElementException
# else:
# return super().find_element(by=by, value=value)
def find_element_recursive(self, by, value, frames):
for frame in frames:
try:
iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
except Exception as e:
print(e)
find_element = False
# 遍历所有的 iframe 并查找里面的元素
for iframe in iframes:
# 切换到 iframe
super().switch_to.default_content()
super().switch_to.frame(iframe)
self.iframe_env = True
try:
# 在 iframe 中查找元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
element = super().find_element(by=by, value=value)
find_element = True
except NoSuchElementException as e:
print(f"No such element found in the iframe: {str(e)}")
except Exception as e:
print(f"Exception: {str(e)}")
# 完成操作后切回主文档
# super().switch_to.default_content()
if find_element:
self.switch_to.frame(frame)
except StaleElementReferenceException:
# If the frame has been refreshed, we need to switch to the parent frame first,
self.switch_to.parent_frame()
self.switch_to.frame(frame)
try:
# !!! Attempt to find the element in the current frame, not the context (iframe environment will not change to default), therefore we use super().find_element instead of self.find_element
element = super(MyChrome, self).find_element(by=by, value=value)
return element
if not find_element:
raise NoSuchElementException
else:
return super().find_element(by=by, value=value)
except NoSuchElementException:
# Recurse into nested iframes
nested_frames = super(MyChrome, self).find_elements(By.CSS_SELECTOR, "iframe")
if nested_frames:
element = self.find_element_recursive(by, value, nested_frames)
if element:
return element
except Exception as e:
print(f"Exception while processing frame: {e}")
def find_elements(self, by=By.ID, value=None, iframe=False):
# 在这里改变查找元素的行为
if self.iframe_env:
super().switch_to.default_content()
self.iframe_env = False
raise NoSuchElementException(f"Element {value} not found in any frame or iframe")
def find_element(self, by=By.ID, value=None, iframe=False):
self.switch_to.default_content() # Switch back to the main document
self.iframe_env = False
if iframe:
# 获取所有的 iframe
iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
find_element = False
# 遍历所有的 iframe 并找到里面的元素
for iframe in iframes:
# 切换到 iframe
frames = self.find_elements(By.CSS_SELECTOR, "iframe")
if not frames:
raise NoSuchElementException(f"No iframes found in the current page while searching for {value}")
self.iframe_env = True
return self.find_element_recursive(by, value, frames)
else:
# Find element in the main document as normal
return super(MyChrome, self).find_element(by=by, value=value)
# def find_elements(self, by=By.ID, value=None, iframe=False):
# # 在这里改变查找元素的行为
# if self.iframe_env:
# super().switch_to.default_content()
# self.iframe_env = False
# if iframe:
# # 获取所有的 iframe
# iframes = super().find_elements(By.CSS_SELECTOR, "iframe")
# find_element = False
# # 遍历所有的 iframe 并找到里面的元素
# for iframe in iframes:
# # 切换到 iframe
# try:
# super().switch_to.default_content()
# super().switch_to.frame(iframe)
# self.iframe_env = True
# # 在 iframe 中查找元素
# # 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
# elements = super().find_elements(by=by, value=value)
# if len(elements) > 0:
# find_element = True
# # 完成操作后切回主文档
# # super().switch_to.default_content()
# if find_element:
# return elements
# except NoSuchElementException as e:
# print(f"No such element found in the iframe: {str(e)}")
# except Exception as e:
# print(f"Exception: {str(e)}")
# if not find_element:
# raise NoSuchElementException
# else:
# return super().find_elements(by=by, value=value)
def find_elements_recursive(self, by, value, frames):
for frame in frames:
try:
try:
super().switch_to.default_content()
super().switch_to.frame(iframe)
self.iframe_env = True
# 在 iframe 中查找元素
# 在这个例子中,我们查找 XPath 为 '//div[1]' 的元素
elements = super().find_elements(by=by, value=value)
if len(elements) > 0:
find_element = True
# 完成操作后切回主文档
# super().switch_to.default_content()
if find_element:
self.switch_to.frame(frame)
except StaleElementReferenceException:
# If the frame has been refreshed, we need to switch to the parent frame first,
self.switch_to.parent_frame()
self.switch_to.frame(frame)
# Directly find elements in the current frame
elements = super(MyChrome, self).find_elements(by=by, value=value)
if elements:
return elements
# Recursively search for elements in nested iframes
nested_frames = super(MyChrome, self).find_elements(By.CSS_SELECTOR, "iframe")
if nested_frames:
elements = self.find_elements_recursive(by, value, nested_frames)
if elements:
return elements
except NoSuchElementException as e:
print(f"No such element found in the iframe: {str(e)}")
except Exception as e:
print(f"Exception: {str(e)}")
if not find_element:
raise NoSuchElementException
except Exception as e:
print(f"Exception while processing frame: {e}")
raise NoSuchElementException(f"Elements with {value} not found in any frame or iframe")
def find_elements(self, by=By.ID, value=None, iframe=False):
self.switch_to.default_content() # Switch back to the main document
self.iframe_env = False
if iframe:
frames = self.find_elements(By.CSS_SELECTOR, "iframe")
if not frames:
return [] # Return an empty list if no iframes are found
self.iframe_env = True
return self.find_elements_recursive(by, value, frames)
else:
return super().find_elements(by=by, value=value)
# Find elements in the main document as normal
return super(MyChrome, self).find_elements(by=by, value=value)
# MacOS不支持直接打包带Cloudflare的功能,如果要自己编译运行,可以把这个if去掉,然后配置好浏览器和driver路径
if sys.platform != "darwin":

+ 2
- 2
Extension/manifest_v3/package-lock.json Переглянути файл

@ -1,12 +1,12 @@
{
"name": "EasySpider",
"version": "0.5.0",
"version": "0.6.0",
"lockfileVersion": 3,
"requires": true,
"packages": {
"": {
"name": "EasySpider",
"version": "0.5.0",
"version": "0.6.0",
"license": "AGPL-3.0",
"dependencies": {
"crx": "^5.0.1",

+ 1
- 1
Extension/manifest_v3/src/content-scripts/config.json Переглянути файл

@ -1 +1 @@
{"language":"zh"}
{"language":"en"}

Завантаження…
Відмінити
Зберегти