大家好,我需要遍历一个网站上的所有分页。
问题出现在大约第 30 页:列表加载动画会无限期挂起,因此无法再执行任何操作。
此问题仅发生在 Selenium 中。
如果我自己在浏览器中手动翻页,一切正常。可能是什么原因导致了这个问题?
from selenium import webdriver
from selenium.common.exceptions import *
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import time
# Module-level call counter used by get_index_develop(); it must exist before
# the first call, otherwise the in-place increment raises NameError.
index_develop = 0


def get_index_develop():
    """Increment the module-level ``index_develop`` counter and return it.

    Returns:
        The counter value after the increment (1 on the first call).
    """
    global index_develop
    index_develop += 1
    return index_develop
def get_translated_text(text):
    """Return *text* unchanged.

    No translation is performed here; the argument is handed back as-is.
    """
    translated = text
    return translated
def init_driver():
    """Start a Firefox WebDriver session using the geckodriver binary.

    Returns:
        The newly created ``webdriver.Firefox`` instance.

    Raises:
        SessionNotCreatedException: re-raised, after printing a hint for the
            user, when the browser session cannot be created.
    """
    ff = "../install/geckodriver.exe"
    # chrome_option = webdriver.ChromeOptions()
    # # chrome_option.add_argument("headless")
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_option.add_experimental_option("prefs", prefs)
    try:
        # NOTE(review): ``executable_path`` is the Selenium 3 style keyword;
        # newer Selenium releases may require a Service object instead --
        # confirm against the installed version.
        return webdriver.Firefox(executable_path=ff)
        # driver = webdriver.Chrome(executable_path=ff, options=chrome_option)
        # driver = webdriver.Chrome(executable_path=ff, chrome_options=chrome_option, service_args=service_args)
    except SessionNotCreatedException:
        print("Ошибка инициализации браузера. Скорее всего у вас не установлен браузер. Пожалуйста обратитесь к разработчику парсера")
        # Re-raise the real error: there is no driver to return, and falling
        # through to ``return driver`` would only raise a misleading
        # UnboundLocalError.
        raise
def close_pop_up_window(driver):
    """Try to dismiss a pop-up by clicking the first clickable header control.

    Every element matching the pop-up header selector is tried in order; the
    first successful click ends the search. Elements whose click fails with a
    WebDriver error are skipped. The function then pauses for one second
    whether or not anything was clicked.

    Args:
        driver: Selenium WebDriver instance with the page already loaded.
    """
    blocks = driver.find_elements(
        By.CSS_SELECTOR,
        "div.b-popup.js-popup >div > div.b-popup__header.js-popup__header > div")
    for block in blocks:
        try:
            block.click()
            break
        except WebDriverException:
            # Best effort: this candidate could not be clicked, try the next.
            # Only WebDriver errors are swallowed, so Ctrl+C still interrupts.
            continue
    time.sleep(1)
def _click_next_page(driver, timeout):
    """Wait up to *timeout* seconds for the "next page" control and click it."""
    next_button = WebDriverWait(driver, timeout).until(
        EC.visibility_of_element_located(
            (By.CSS_SELECTOR,
             "span.b-pagination__item.b-pagination__item--next.js-pagination-next")))
    next_button.click()


def parse_list_projects(driver):
    """Walk the paginated project listing and collect every project link.

    For each page: dismiss pop-ups, wait for the loading overlay to vanish,
    record the ``data-href`` of every project snippet, then move to the next
    page. Moving on is attempted up to three times (plain click; after
    pressing the confirm button of a dialog; after closing a pop-up). When
    all three attempts fail, pagination is considered finished.

    Args:
        driver: Selenium WebDriver instance used for browsing.

    Returns:
        A ``(urls, driver)`` tuple: the collected ``data-href`` values in
        page order and the same driver that was passed in.
    """
    urls = []
    driver.get("https://www.hurriyetemlak.com/projeler/projects")
    # Pagination loop
    while True:
        close_pop_up_window(driver)
        # Block until the loading overlay is gone (up to 300 s).
        # NOTE(review): a timeout here is not caught and aborts the run; the
        # discussion around this script attributes a stuck loader to the site
        # answering 429 Too Many Requests -- confirm and handle if needed.
        WebDriverWait(driver, 300).until(EC.invisibility_of_element_located(
            (By.CSS_SELECTOR, "div.b-scroll.js-search-left-content.js-preload-parent.b-preload-block.load")))
        items = driver.find_elements(By.CSS_SELECTOR, "div.b-snippet__wrapper.js-complex__wrapper")
        for item in items:
            href = item.get_attribute("data-href")
            print("Найдена ссылка на проект", href)
            urls.append(href)
        try:
            _click_next_page(driver, 10)
            print("Перешли на следующую страницу")
        except Exception:
            try:
                # Second attempt: a confirmation dialog may be covering the
                # pagination control; press its button and retry.
                print("Проверка наличия всплываюшего окна")
                button_close = driver.find_element(
                    By.CSS_SELECTOR,
                    "button.b-button.b-button--full.b-button--confirm")
                time.sleep(2)
                button_close.click()
                time.sleep(2)
                print("Окно закрыли")
                _click_next_page(driver, 10)
                print("Нажатие на pagination снова")
            except Exception as e:
                try:
                    # Third attempt: close a regular pop-up and retry with a
                    # slightly longer wait.
                    close_pop_up_window(driver)
                    _click_next_page(driver, 15)
                except Exception:
                    # The reported error is the one from the second attempt.
                    print("Pagination не найдены. Конец перехода между страницами", e)
                    break
    return urls, driver
if __name__ == '__main__':
    # Script entry point: collect the project links and report elapsed time.
    start = time.time()
    driver = init_driver()
    try:
        urls, driver = parse_list_projects(driver)
        print("Парсинг проектов")
        print("Парсинг окончен. Время выполнения", time.time() - start)
    finally:
        # Always shut the browser down, even when parsing fails, so no
        # browser/driver process is left running.
        driver.quit()
refresher 只是数据加载期间显示的加载指示元素。
好问题,很有趣!加载之所以挂起,是因为该站点对过于频繁的请求设有防护。在第三十次请求时,它返回了状态码
429 Too Many Requests。在查看了网站在浏览器中刚打开时返回的内容之后,我推断 CloudFlare 防护是按每个新会话单独计数的。事实也确实如此。
解决方案是每隔 20 页关闭浏览器,然后再次打开。
我还稍微清理了您关于翻译页面和关闭弹出对话框的逻辑。
这是发生的事情:
我用 goto_page 代替了
next_page,它有两种工作模式。另外,打开浏览器时,我接受了 cookie 请求。每次页面加载完成后,我都会把页面向下滚动。也许这并不是必需的,您可以自己进一步调试。
另外,我更喜欢 Google Chrome,但我相信它也可以在 FireFox 下工作。
在我写这个回答的过程中,脚本已经运行到了最后。总耗时 1094 秒。
PS 前两行代码用于我的控制台上的西里尔文输出......