欢迎光临
感谢一路有你

py 采集流程整理

过程

  1. 获取列表链接
  2. 打开链接获取详情
  3. 处理数据
  4. 保存数据
  5. 对数据进行清洗(如去重)
  6. 处理杂质
  7. 可视化数据
import urllib

import pandas as pd
import requests
from lxml import etree
import os
import csv
import time
import pymysql

# Disable HTTPS certificate verification globally.
# NOTE(review): this replaces the private ssl._create_default_https_context
# hook with the unverified-context factory, so any client that builds its
# default HTTPS context through that hook stops validating certificates for
# the whole process. Confirm this is really wanted before reusing the script.
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


# Collect the listing links
def get_url_list(num):
    """Scrape listing pages 1..num and store every entry via ``db_insert``.

    For each page the title, detail URL and image URL of every list entry are
    extracted and handed to ``db_insert`` as a dict with the keys ``title``,
    ``url``, ``pic``, ``pic_new`` and ``status`` (always 0).

    Args:
        num: Number of listing pages to fetch, starting at page 1.

    Returns:
        None. Progress is reported on stdout.

    Raises:
        IndexError: If a page yields fewer titles, links or images than list
            entries (the three result lists are paired by position).
        requests.RequestException: If a page cannot be fetched after retries.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'}
    request_timeout = 10  # seconds; without a timeout a stalled server blocks forever

    # These expressions are absolute (they start at the document root), so
    # their result does not depend on the entry being processed. They are
    # evaluated once per page instead of once per entry.
    entry_xpath = '//*[@id="pic-detail"]/div/div[2]/div[2]/ul/li/div/div/a'

    arr = range(1, num + 1)
    print(arr)

    # Assigning requests.adapters.DEFAULT_RETRIES does not make requests.get()
    # retry; mounting an adapter with max_retries on a session does, and the
    # session also reuses the connection across pages.
    with requests.Session() as session:
        adapter = requests.adapters.HTTPAdapter(max_retries=5)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        for i in arr:
            url_ = 'http://www.***.com/photo/list/?page={}'.format(i)
            res = session.get(url_, headers=headers, timeout=request_timeout)
            doc = etree.HTML(res.content.decode())
            nodes = doc.xpath('//ul[@class="list-group"]/li/div/div/a')

            titles = doc.xpath(entry_xpath + '/p/text()')
            urls = doc.xpath(entry_xpath + '/@href')
            imgs = doc.xpath(entry_xpath + '/img/@data-backup')

            # NOTE(review): entries are matched to titles/links/images purely
            # by position; an entry without an image would shift every later
            # image by one. Relative XPath on each node would avoid that but
            # needs checking against the real page markup.
            for k in range(len(nodes)):
                img = imgs[k] if k < len(titles) and k < len(urls) else None
                item = {
                    'title': titles[k],
                    'url': urls[k],
                    'pic': img,
                    'pic_new': img.replace('http://img.***.com', 'http://oss1.wangmingchang.com/0bd86e854d29ca97c3510e774d9cd4d4/uploads', 1),
                    'status': 0,
                }
                print(item)
                db_insert(item)
            print("====第" + format(i) + "页已完成====")
    print("OK")

# Save the scraped rows
def write_to_file(info):
    """Append scraped items to '采集链接列表.csv'.

    The CSV columns are ``title``, ``url``, ``img`` and ``status``. Items
    produced by the scraper store the image URL under ``pic``; that value is
    written to the ``img`` column when an item has no ``img`` key. Any other
    keys (e.g. ``pic_new``) are not part of the CSV and are left out, instead
    of making the whole write fail. Missing values are written as ''.

    The header row is written only when the file is new or empty, so repeated
    calls do not scatter header lines through the data. The file is written
    as UTF-8.

    Args:
        info: Iterable of dicts describing scraped entries.

    Returns:
        None.
    """
    path = '采集链接列表.csv'
    fieldnames = ['title', 'url', 'img', 'status']

    rows = [
        {
            'title': item.get('title', ''),
            'url': item.get('url', ''),
            'img': item.get('img', item.get('pic', '')),
            'status': item.get('status', ''),
        }
        for item in info
    ]

    # Decide before opening: opening in append mode creates the file.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0

    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        try:
            writer.writerows(rows)
        except csv.Error as e:
            # Stay best-effort like before, but say what went wrong.
            print("error", e)

# Clean the scraped data
def handle_data():
    """Load '采集链接列表-2.csv', drop duplicate rows and print what is left.

    Two rows count as duplicates when both their ``title`` and ``url`` match;
    the first occurrence is kept. The cleaned frame is only printed: nothing
    is returned and nothing is written back to disk.
    """
    deduplicated = pd.read_csv("采集链接列表-2.csv").drop_duplicates(
        subset=['title', 'url'],
        keep='first',
    )
    print(deduplicated)
    # Persisting the cleaned frame (e.g. with DataFrame.to_csv) is left
    # disabled here.


        # save_img(url)

# Save an image
def save_img(img_url, file_path=os.path.join('book', 'img')):
    """Download ``img_url`` into a local folder, mirroring the URL's path.

    The target is ``file_path`` joined with the path component of the URL, so
    ``http://host/a/b/c.jpg`` is stored as ``<file_path>/a/b/c.jpg``. Only
    the URL path is used for the local name; the scheme and host are not
    turned into directories. Missing directories are created.

    Args:
        img_url: URL of the image to download.
        file_path: Base directory for downloads. Defaults to ``book/img``
            relative to the current working directory.

    Returns:
        1 if the image was downloaded, 0 on any failure (including a URL that
        has no usable file path or that contains ``..`` segments).
    """
    # `import urllib` alone does not guarantee the submodules are loaded.
    import urllib.parse
    import urllib.request

    try:
        url_path = urllib.parse.unquote(urllib.parse.urlparse(img_url).path)
        parts = [part for part in url_path.split('/') if part not in ('', '.')]
        # Refuse empty names and anything that could climb out of file_path.
        if not parts or '..' in parts:
            return 0
        filename = os.path.join(file_path, *parts)

        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        # urlretrieve() goes through the globally installed opener.
        urllib.request.install_opener(opener)

        target_dir = os.path.dirname(filename)
        if not os.path.exists(target_dir):
            print("创建目录...")
            os.makedirs(target_dir, exist_ok=True)

        # Download the image and store it in the target folder.
        urllib.request.urlretrieve(img_url, filename=filename)
        return 1
    except Exception as e:
        # Callers only get 0/1 back, so report the reason here.
        print("error", e)
        return 0
# Database access
def db_insert(data):
    """Insert one scraped entry into the ``list`` table unless it exists.

    Opens a connection to the local ``py_test`` MySQL database, asks
    ``db_is_cz`` whether an identical row is already stored and, if not,
    inserts ``data``. The connection is closed on every path.

    Args:
        data: Dict with the keys ``title``, ``pic``, ``pic_new``, ``url`` and
            ``status``.

    Returns:
        None. The outcome ("ok", "error" or "数据已存在..") is printed.
    """
    db = pymysql.connect(host="localhost", port=3306, user="root", password="", db="py_test", charset="utf8")
    try:
        is_cz = db_is_cz(db, data)
        if is_cz is None:
            # The lookup itself failed and db_is_cz has already reported it;
            # do not claim the row exists.
            return
        if is_cz != 1:
            print("数据已存在..")
            return

        # Values are passed separately so the driver escapes them; never
        # build this statement with % or + string formatting.
        # The `list` table must already exist.
        sql = 'INSERT INTO list(title, pic,pic_new, url, status) VALUES (%s, %s, %s, %s,%s)'
        parm = (data['title'], data['pic'], data['pic_new'], data['url'], data['status'])
        try:
            db.ping(reconnect=True)  # re-establish the connection if it dropped
            with db.cursor() as cursor:
                cursor.execute(sql, parm)
            db.commit()
            print("ok")
        except Exception:
            if db.open:
                db.rollback()
            print("error")
    finally:
        # db_is_cz may already have closed the connection after a failure;
        # closing twice raises, hence the check.
        if db.open:
            db.close()
# Check whether a row already exists
def db_is_cz(db, data):
    """Report whether ``data`` still needs to be inserted into ``list``.

    Looks for a row whose ``title``, ``url`` and ``pic`` all equal the values
    in ``data``.

    Args:
        db: Open database connection (only ``cursor``, ``ping``, ``rollback``,
            ``close`` and the ``open`` flag are used).
        data: Dict with the keys ``title``, ``url`` and ``pic``.

    Returns:
        1 if no matching row exists (the caller should insert), 0 if a
        matching row exists, or None if the lookup failed. On failure
        "error" is printed and the connection is rolled back and closed when
        it is still open.
    """
    sql = 'select * from list where title= %s and url= %s and pic= %s'
    try:
        parm = (data['title'], data['url'], data['pic'])
        with db.cursor() as cursor:
            db.ping(reconnect=True)
            cursor.execute(sql, parm)
            return 0 if cursor.fetchone() else 1
    except Exception as e:
        print("error", e)
        # Cleaning up a connection that is already gone would raise a second
        # error and hide the first one, so only touch it while it is open.
        if db.open:
            db.rollback()
            db.close()
    return None

def test():
    """Scratch placeholder for quick experiments; always returns 0."""
    result = 0
    return result

if __name__ == '__main__':
    # Scrape the first five listing pages and store the entries.
    # get_url_list() returns nothing, so its result is not bound to a name
    # (the old code bound it to `list`, shadowing the builtin).
    # Other entry points that can be run from here by hand:
    #   handle_data()          - de-duplicate a previously exported CSV
    #   save_img(<image url>)  - download a single image, returns 1 or 0
    get_url_list(5)




赞(0) 打赏
未经允许不得转载:王明昌博客 » py 采集流程整理
分享到: 更多 (0)

觉得文章有用就打赏一下文章作者

支付宝扫一扫打赏

微信扫一扫打赏