Baidu Translate (example)

Note: if the sign/token parameters required by the desktop site cannot be obtained, you can test against the mobile endpoint instead, which is what the example below does by sending a mobile User-Agent.
```python
import requests

# a mobile User-Agent makes Baidu serve the simpler mobile API,
# which does not require the sign/token parameters
header = {
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Mobile Safari/537.36'
}
data = {
    'from': 'zh',
    'to': 'en',
    'query': '王明昌博客',
}
post_url = 'http://fanyi.baidu.com/basetrans'

r = requests.post(post_url, data=data, headers=header)
print(r.content.decode())
```
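The endpoint returns JSON, so `r.json()` is more convenient than decoding raw bytes. Below is a minimal sketch of pulling out the translated text, assuming the response carries the translation under a `trans` list with `dst` fields; these field names are not confirmed by the original post, so inspect the actual response first:

```python
# hypothetical response handling; verify the real JSON layout
# with print(r.json()) before relying on these field names
resp = r.json()
for entry in resp.get('trans', []):
    print(entry.get('dst'))  # the translated string, if present
```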
Fetch the articles from a Jianshu profile page

```python
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}

# walk the first three pages of the profile's article list
for i in range(1, 4):
    url_ = 'https://www.jianshu.com/u/4642b9fae22c?order_by=shared_at&page={}'.format(i)
    res = requests.get(url_, headers=headers)
    res = etree.HTML(res.content.decode())
    nodes = res.xpath('//ul[@class="note-list"]/li')
    for node in nodes:
        item = {}
        title = node.xpath('.//a[@class="title"]/text()')[0]
        time = node.xpath('.//span[@class="time"]/@data-shared-at')[0]
        abstract = node.xpath('.//p[@class="abstract"]/text()')[0]
        img = node.xpath('.//img[@class=" img-blur-done"]/@src')
        url = 'https://www.jianshu.com' + node.xpath('.//a/@href')[0]
        item['title'] = title
        item['time'] = time
        item['url'] = url
        item['abstract'] = abstract  # was mistakenly assigned title
        item['img'] = img[0] if img else None  # was mistakenly assigned time; not every card has a cover
        print(item)
```
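If you want to keep the results instead of just printing them, one simple option is to append each `item` as a line of JSON. A minimal sketch (the `save_item` helper and its output filename are not from the original post):

```python
import json

def save_item(item, path='jianshu_articles.jsonl'):
    # append each scraped item as one JSON object per line (JSON Lines)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')
```

Call `save_item(item)` in place of (or alongside) `print(item)` in the loop above.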
Download images
```python
# -*- coding: utf-8 -*-
import requests
import re
import os


class GetImage(object):
    def __init__(self, url):
        self.url = url
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
        }
        self.dir_path = os.path.dirname(os.path.abspath(__file__))
        self.path = self.dir_path + '/imgs'
        # create the image directory if it does not exist yet
        if not os.path.exists(self.path):
            os.makedirs(self.path)

    def download(self, url):
        # fetch a URL and return the response, or None on failure
        try:
            res = requests.get(url, headers=self.headers)
            return res
        except Exception as e:
            print(url + ' download failed: ' + str(e))

    def parse(self, res):
        # extract img src attributes and prefix them with the site directory
        content = res.content.decode()
        img_list = re.findall(r'<img.*?src="(.*?)"', content, re.S)
        img_list = ['http://www.yangqq.com/skin/jxhx/' + url for url in img_list]
        return img_list

    def save(self, res_img, file_name):
        if res_img:
            with open(file_name, 'wb') as f:
                f.write(res_img.content)
            print(file_name + ' saved')

    def run(self):
        # download the page
        res = self.download(self.url)
        # parse out the image URLs
        url_list = self.parse(res)
        # download and save each image
        for url in url_list:
            res_img = self.download(url)
            name = url.strip().split('/').pop()
            file_name = self.path + '/' + name
            self.save(res_img, file_name)


if __name__ == '__main__':
    url_list = ['https://www.yangqq.com/skin/jxhx/',
                'https://www.yangqq.com/skin/jxhx/list.html',
                'https://www.yangqq.com/skin/jxhx/share.html',
                'https://www.yangqq.com/skin/jxhx/list2.html',
                'https://www.yangqq.com/skin/jxhx/list3.html',
                'https://www.yangqq.com/skin/jxhx/daohang.html',
                'https://www.yangqq.com/skin/jxhx/about.html']
    for url in url_list:
        spider = GetImage(url)
        spider.run()
```
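The hard-coded `'http://www.yangqq.com/skin/jxhx/'` prefix in `parse()` breaks for absolute `src` values or for pages in other directories. A more robust variant, shown as a hypothetical standalone helper (`extract_img_urls` is not part of the original post), resolves each `src` against the page URL with `urllib.parse.urljoin`:

```python
import re
from urllib.parse import urljoin

def extract_img_urls(page_url, html):
    # urljoin leaves absolute src values untouched and resolves
    # relative ones against the page URL
    img_list = re.findall(r'<img.*?src="(.*?)"', html, re.S)
    return [urljoin(page_url, src) for src in img_list]
```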
Fetch short reviews of Ne Zha (《哪吒之魔童降世》) and save them to CSV
```python
# -*- coding: utf-8 -*-
import requests
import csv
import os
import time
from lxml import etree


def get_one_page(url):
    # fetch one page of comments; return the response or None on failure
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response
        return None
    except requests.RequestException as e:
        print(e)
        return None


def parse_one_page(res):
    # extract user, time, and comment text from each comment node
    info = []
    res = etree.HTML(res.content.decode())
    nodes_list = res.xpath('//div[@class="comment-item"]')
    for node in nodes_list:
        comic = {}
        comic['User'] = node.xpath('.//span[@class="comment-info"]/a/text()')[0].strip()
        comic['Time'] = node.xpath('.//span[@class="comment-info"]/span[3]/text()')[0].strip()
        comic['Comment'] = node.xpath('.//span[@class="short"]/text()')[0].strip()
        print(comic)
        info.append(comic)
    return info


def write_to_file(info):
    file_name = '《哪吒之魔童降世》短评.csv'
    # write the header only for a new file; the original wrote it on
    # every call, repeating the header row once per page
    write_header = not os.path.exists(file_name)
    with open(file_name, 'a', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=['User', 'Time', 'Comment'])
        if write_header:
            writer.writeheader()
        writer.writerows(info)


def main(start):
    url = 'https://movie.douban.com/subject/26794435/comments?start=' + str(start) + '&limit=20&sort=new_score&status=P&percent_type='
    html = get_one_page(url)
    if html is None:
        return
    data = parse_one_page(html)
    write_to_file(data)


if __name__ == '__main__':
    for i in range(10):
        main(i * 20)
        print('Page {} collected.'.format(i))  # progress marker per page
        time.sleep(1)  # pause one second between pages
```
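To sanity-check the output, the CSV can be read back with `csv.DictReader`. A small sketch, assuming the file was written with the `utf-8-sig` encoding used above:

```python
import csv

with open('《哪吒之魔童降世》短评.csv', encoding='utf-8-sig') as f:
    for row in csv.DictReader(f):
        print(row['User'], row['Time'], row['Comment'][:30])  # first 30 chars
```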
Proxies

Pass a `proxies` dict to requests to route traffic through a proxy:
```python
import requests

# the scheme key ('http' / 'https') selects which kind of
# request goes through the proxy
proxies = {'http': 'http://27.152.90.200:80'}
header = {}
requests.get('http://www.baidu.com', proxies=proxies, headers=header)  # was request.get, a typo
```
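A quick way to confirm the proxy is actually in use is to request a service that echoes your IP. A sketch using httpbin.org/ip for that purpose; the proxy address above is only an example and may well be dead:

```python
import requests

proxies = {
    'http': 'http://27.152.90.200:80',   # example proxy, likely stale
    'https': 'http://27.152.90.200:80',  # same proxy for HTTPS traffic
}
try:
    r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(r.text)  # should show the proxy's IP, not yours
except requests.RequestException as e:
    print('proxy failed:', e)
```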