如果你对该文章中的内容有疑问/不解,可以点击此处链接提问
要注明问题和此文章链接地址 点击此处跳转
"""Baidu Tieba crawler: download a forum's paginated listing pages to local HTML files."""


class TiebaSpider:
    """Fetch the listing pages of one tieba (Baidu forum) and save each page to disk.

    The forum lists 50 threads per page; page N is requested with the query
    parameter ``pn=(N-1)*50``.
    """

    def __init__(self, tieba_name, page_count=1000):
        """Prepare the URL template and request headers.

        Args:
            tieba_name: Name of the forum to crawl (inserted into the ``kw=`` query).
            page_count: How many pages to fetch. Defaults to 1000, matching the
                original hard-coded behavior.
        """
        self.tieba_name = tieba_name
        # Echo the requested forum so the operator can confirm the target.
        print(self.tieba_name)
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.page_count = page_count
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"}

    def get_url_list(self):
        """Return the list of page URLs; the pn offset advances by 50 per page."""
        return [self.url_temp.format(i * 50) for i in range(self.page_count)]

    def parse_url(self, url):
        """GET *url* with the spider's headers and return the decoded body text."""
        # Lazy import: only the actual fetch needs the third-party dependency,
        # so URL building and saving work even where requests is not installed.
        import requests

        print(url)  # progress trace: show which page is being fetched
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def save_html(self, html_str, page_num):
        """Write one page's HTML to ``<tieba_name>-<page_num>页.HTML`` in the cwd."""
        file_path = "{}-{}页.HTML".format(self.tieba_name, page_num)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    def run(self):
        """Crawl: build the URL list, fetch each page, and save it numbered from 1."""
        # enumerate() replaces the original O(n) url_list.index(url) lookup per
        # page; the result is identical since every URL in the list is unique.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)


if __name__ == '__main__':
    tieba_spider = TiebaSpider("李毅")
    tieba_spider.run()