Disclaimer: this code is for learning purposes only; this site takes no responsibility for any consequences of using it for commercial or other profit-making purposes.
彼岸桌面
- 彼岸桌面 scraping code
Notes
- Dependencies: requests, bs4, random, time (plus a local UserAgent helper; a sketch follows this list)
- Change: your download directory path, i.e. the '下载目录路径' placeholder in the code
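The script below also does import UserAgent, a local helper module whose contents are not shown in the original post. The only thing the script uses from it is get_headers(), which returns a request-headers dict with a random User-Agent, so a minimal stand-in might look like the sketch below (an assumption about its internals; the User-Agent strings are illustrative placeholders):

# UserAgent.py -- minimal stand-in for the helper module imported by the scraper
import random

# Small pool of desktop-browser User-Agent strings (placeholders for illustration)
_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
]

def get_headers():
    """Return a headers dict with a randomly chosen User-Agent."""
    return {'User-Agent': random.choice(_USER_AGENTS)}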
'''*************************************************
Copyright (Python), 2020-, Literature Tech. Co., Ltd.
source: None
Author: Written by Literature
Version: 1.0
Date: 2020.07.17
Description:
Others: None
Function List: main
History: The first edition 2020.05.19
*************************************************'''
import requests
from bs4 import BeautifulSoup
import os
import time
import random
import UserAgent  # local helper module providing get_headers() (see the sketch above)
index = 'http://www.netbian.com' # root URL of the site
interval = 0.1 # pause between image downloads, in seconds
firstDir = '下载目录路径' # top-level download directory (change this)
classificationDict = {} # info about the site's category sub-pages
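# For example, a hypothetical download directory (adjust to your own machine):
# firstDir = 'D:/wallpaper/netbian'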
# Fetch a page and return the elements matching a CSS selector
def screen(url, select):
    headers = UserAgent.get_headers() # pick a random headers dict
    html = requests.get(url = url, headers = headers)
    html.encoding = 'gbk' # the site is GBK-encoded
    html = html.text
    soup = BeautifulSoup(html, 'lxml')
    return soup.select(select)
# Get the last page number of a category sub-page
def screenPage(url, select):
    html = requests.get(url = url, headers = UserAgent.get_headers())
    html.encoding = 'gbk'
    html = html.text
    soup = BeautifulSoup(html, 'lxml')
    return soup.select(select)[0].next_sibling.text
# Download one image into the given directory
def download(src, name, path):
    if(isinstance(src, str)):
        response = requests.get(src)
        path = path + '/' + name + '.jpg'
        while(os.path.exists(path)): # if the file name already exists, append a random number
            path = path.split(".")[0] + str(random.randint(2, 17)) + '.' + path.split(".")[1]
        with open(path,'wb') as pic:
            for chunk in response.iter_content(128):
                pic.write(chunk)
# Locate the 1920x1080 version of each image and download it
def handleImgs(links, path):
    for link in links:
        href = link.get('href')
        if(href == 'http://pic.netbian.com/'): # filter out the image ad
            continue
        # first hop: to the image's detail page
        if('http://' in href): # a few images give an absolute rather than a relative address
            url = href
        else:
            url = index + href
        select = 'div#main div.endpage div.pic div.pic-down a'
        link = screen(url, select)
        if(link == []):
            print(url + ' 无此图片,爬取失败')
            continue
        href = link[0].get('href')
        # second hop: to the download page
        url = index + href
        # now we have the image element itself
        select = 'div#main table a img'
        link = screen(url, select)
        if(link == []):
            print(url + " 该图片需要登录才能爬取,爬取失败")
            continue
        name = link[0].get('alt').replace('\t', '').replace('|', '').replace(':', '').replace('\\', '').replace('/', '').replace('*', '').replace('?', '').replace('"', '').replace('<', '').replace('>', '') # strip characters that are illegal in file names
        print(name) # print the file name being downloaded
        src = link[0].get('src')
        if(requests.get(src).status_code == 404):
            print(url + ' 该图片下载链接404,爬取失败')
            print()
            continue
        print()
        download(src, name, path)
        time.sleep(interval)
# Download every page of one category sub-page
def select_classification(choice):
    print('---------------------------')
    print('--------------' + choice + '-------------')
    print('---------------------------')
    secondUrl = classificationDict[choice]['url']
    secondDir = classificationDict[choice]['path']
    if(not os.path.exists(secondDir)):
        os.mkdir(secondDir) # create the category directory
    select = '#main > div.page > span.slh'
    pageIndex = screenPage(secondUrl, select)
    lastPagenum = int(pageIndex) # number of the last page of this category
    for i in range(0, lastPagenum):
        if i == 0:
            url = secondUrl
        else:
            url = secondUrl + 'index_%d.htm' %(i+1)
        print('--------------' + choice + ': ' + str(i+1) + '-------------')
        path = secondDir + '/' + str(i+1)
        if(not os.path.exists(path)):
            os.mkdir(path) # create a per-page directory under the category directory
        select = 'div#main div.list ul li a'
        links = screen(url, select)
        handleImgs(links, path)
# Console UI: the user chooses which category to download
def ui():
    print('--------------netbian-------------')
    print('全部', end=' ')
    for c in classificationDict.keys():
        print(c, end=' ')
    print()
    choice = input('请输入分类名:')
    if(choice == '全部'): # '全部' means download every category
        for c in classificationDict.keys():
            select_classification(c)
    elif(choice not in classificationDict.keys()):
        print("输入错误,请重新输入!")
        print('----')
        ui()
    else:
        select_classification(choice)
# Collect the category sub-pages into classificationDict
def init_classification():
    url = index
    select = '#header > div.head > ul > li:nth-child(1) > div > a'
    classifications = screen(url, select)
    for c in classifications:
        href = c.get('href') # this is a relative address
        text = c.string # category name
        if(text == '4k壁纸'): # the 4k wallpaper category cannot be scraped due to permission restrictions, so skip it
            continue
        secondDir = firstDir + '/' + text # category directory
        url = index + href # URL of the category sub-page
        global classificationDict
        classificationDict[text] = {
            'path': secondDir,
            'url': url
        }
def main():
    if(not os.path.exists(firstDir)):
        os.mkdir(firstDir) # create the top-level download directory
    init_classification()
    ui()
if __name__ == '__main__':
    main()
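All of the page handling above goes through screen(): fetch a page, decode it as GBK, and pick out elements with a CSS selector. As a quick, self-contained illustration of that selector step, here is the same kind of select() call run against a made-up HTML snippet (the markup is invented for the example and is not the site's real page structure):

# Standalone illustration of the CSS-selector step used by screen()
from bs4 import BeautifulSoup

# Invented markup, loosely shaped like a list page (not the site's real HTML)
sample_html = '''
<div id="main">
  <div class="list">
    <ul>
      <li><a href="/desk/123.htm"><img alt="示例壁纸" src="/img/123s.jpg"></a></li>
      <li><a href="/desk/456.htm"><img alt="another one" src="/img/456s.jpg"></a></li>
    </ul>
  </div>
</div>
'''

soup = BeautifulSoup(sample_html, 'lxml')
links = soup.select('div#main div.list ul li a') # same selector as in select_classification()
for a in links:
    print(a.get('href')) # -> /desk/123.htm, /desk/456.htm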
Sample run output