Below is my code. It parses a page of the site, collects all the links on it, and writes them to a file. How do I make the parser then follow each of those links, parse all the links on that page, and add them to the file as well? The end goal is to collect links from the entire site. An example would be appreciated. Here is the code:
import io
import typing

import requests
from bs4 import BeautifulSoup


def get_books() -> typing.List[str]:
    rs = requests.get('https://......')
    root = BeautifulSoup(rs.text, 'html.parser')
    t = []
    for x in root.find_all('a'):
        try:
            t1 = x["href"].strip()
            # Turn relative paths into absolute URLs.
            if 'http' not in t1:
                t1 = 'https://....' + t1
            t.append(t1)
        except KeyError:
            # <a> tags without an href attribute are skipped.
            pass
    return t
if __name__ == '__main__':
    books = get_books()

    # Write every collected link to the raw output file.
    with io.open('parsed_data.htm', 'w', encoding='utf8') as f:
        for line in books:
            f.write(line + "\n")

    # Re-read the file and copy each link into a second file,
    # skipping duplicates that were already seen.
    seen_item = []
    with io.open('text_new.txt', 'w', encoding='utf8') as tf, \
            open('parsed_data.htm', 'r', encoding='utf8') as f:
        for line in f:
            if line not in seen_item:
                seen_item.append(line)
                tf.write(line)
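To cover the whole site, the crawler needs a queue of pages still to visit and a set of URLs already seen: every newly discovered internal link is written to the file and also queued so its own page gets parsed in turn. Here is a minimal breadth-first sketch of that idea; the helper name collect_site_links, the example start URL, and the max_pages limit are illustrative placeholders rather than part of your code.

from collections import deque
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def collect_site_links(start_url: str, max_pages: int = 500) -> None:
    # Hypothetical helper: breadth-first crawl starting from start_url.
    # Visit pages from the queue, write each newly discovered internal
    # link to the file, and queue it so its page is parsed in turn.
    domain = urlparse(start_url).netloc
    queue = deque([start_url])
    seen = {start_url}
    pages_visited = 0
    with open('parsed_data.htm', 'w', encoding='utf8') as out:
        out.write(start_url + '\n')
        while queue and pages_visited < max_pages:
            url = queue.popleft()
            pages_visited += 1
            try:
                rs = requests.get(url, timeout=10)
            except requests.RequestException:
                continue  # unreachable page: skip it and move on
            root = BeautifulSoup(rs.text, 'html.parser')
            for a in root.find_all('a', href=True):
                link = urljoin(url, a['href'].strip())
                # Stay on the same site and ignore links seen before.
                if urlparse(link).netloc != domain or link in seen:
                    continue
                seen.add(link)
                out.write(link + '\n')
                queue.append(link)


if __name__ == '__main__':
    # Illustrative start page; replace with the real site root.
    collect_site_links('https://example.com/')

Using a set for seen keeps the membership test O(1) and makes the separate deduplication pass unnecessary, since each link is written only the first time it is discovered. urljoin also resolves relative hrefs against the page they appear on, which is more reliable than prefixing a fixed domain string.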