为什么python代码中无法读取并打印word文件的内容:
import bs4
import time
import random
import requests
import docx
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import magic
import chardet
import codecs
from io import BytesIO
from docx import Document
from selenium import webdriver # pip install selenium
# Список пользовательских агентов
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 11.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.4 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
]
# Функция для получения случайного пользовательского агента
def get_random_user_agent():
return random.choice(user_agents)
# Настройка браузера
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument(f"user-agent={get_random_user_agent()}")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
data = []
# Использование webdriver
session = HTMLSession()
response = session.get(
"https://mos-gorsud.ru/mgs/search?caseDateFrom=16.02.2023&caseDateTo=28.02.2023&courtAlias=mgs&documentStatus=2&processType=6&formType=fullForm&page=2")
time.sleep(3) # Дополнительная задержка на случай, если нужно, но избегайте чрезмерного использования sleep
soup = BeautifulSoup(response.text, 'html.parser')
heads = soup.find('table', class_='custom_table').find_all('tr')
print(len(heads))
for head in heads[1:]:
link = 'https://mos-gorsud.ru' + head.find('nobr').find('a')['href']
print(link)
loom = session.get(link)
abble = BeautifulSoup(loom.text, 'html.parser')
documents = abble.find('table', {'class': 'custom_table mainTable'}).find('tbody').find_all('tr')
for document in documents:
if "Приговор" in document.text:
score = document.find_all('td')
print(len(score))
for soc in score:
stock = soc.find_all('a')
for sto in stock:
print('Prigovor: ' + 'https://mos-gorsud.ru' + sto['href'])
link_doc = 'https://mos-gorsud.ru' + sto['href']
response = requests.get(link_doc, get_random_user_agent())
# Проверка успешности запроса
if response.status_code == 200:
# Сохранение файла на диск
with open('prigovor.docx', 'wb') as file:
file.write(response.content)
# Открытие Word-документа и извлечение текста
document = Document('prigovor.docx')
text = '\n'.join([paragraph.text for paragraph in document.paragraphs])
resheniye = ' '.join(text.split())
# Вывод ссылки и текста
print('Ссылка на файл: https://mos-gorsud.ru' + sto['href'])
print(resheniye)
else:
print(f"Error downloading file: {response.status_code}")
elif "Постановление суда апелляционной инстанции" in document.text:
score = document.find_all('td')
print(len(score))
for soc in score:
stock = soc.find_all('a')
for sto in stock:
print('Postanovleniye : ' + 'https://mos-gorsud.ru' + sto['href'])
link_pod = 'https://mos-gorsud.ru' + sto['href']
response = requests.get(link_pod, get_random_user_agent())
# Проверка успешности запроса
if response.status_code == 200:
# Сохранение файла на диск
with open('resheniye.docx', 'wb') as file:
file.write(response.content)
# Открытие Word-документа и извлечение текста
document = Document('resheniye.docx')
text = '\n'.join([paragraph.text for paragraph in document.paragraphs])
postanov = ' '.join(text.split())
# Вывод ссылки и текста
print('Ссылка на файл: https://mos-gorsud.ru' + sto['href'])
print(postanov)
else:
print(f"Error downloading file: {response.status_code}")
print('\n')
写道:
Traceback (most recent call last):
File "C:\Users\user\PycharmProjects\cases_pars\little.py", line 143, in <module>
document = Document('resheniye.docx')
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\PycharmProjects\cases_pars\.venv\Lib\site-packages\docx\api.py", line 27, in Document
document_part = cast("DocumentPart", Package.open(docx).main_document_part)
^^^^^^^^^^^^^^^^^^
File "C:\Users\user\PycharmProjects\cases_pars\.venv\Lib\site-packages\docx\opc\package.py", line 127, in open
pkg_reader = PackageReader.from_file(pkg_file)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\PycharmProjects\cases_pars\.venv\Lib\site-packages\docx\opc\pkgreader.py", line 22, in from_file
phys_reader = PhysPkgReader(pkg_file)
^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\user\PycharmProjects\cases_pars\.venv\Lib\site-packages\docx\opc\phys_pkg.py", line 21, in __new__
raise PackageNotFoundError("Package not found at '%s'" % pkg_file)
docx.opc.exceptions.PackageNotFoundError: Package not found at 'resheniye.docx'