python爬虫实战

python

Word count: 1.6kReading time: 7 min

 2019/02/15 

中国大学排名定向爬虫
爬取网络图片并保存本地
使用requests爬取任意输入的百度贴吧的网页，并保存到本地
抓取百度贴吧-生活大爆炸吧的基本内容
爬取豆瓣图书名
爬取百度百科1000个页面的数据
- 思路
- 代码

中国大学排名定向爬虫

步骤

1.从网络上获取大学排名网页内容 getHTMLText()
2.提取网页内容中信息到合适的数据结构(列表的形式) fillUnivList()
3.利用数据结构展示并输出结果 printUnivList()

代码

爬取网络图片并保存本地

# coding=utf-8
import requests
import os
# Download a single image and store it under `root`, reusing the file name
# that appears at the end of the URL.
root="F://pics//"
url="http://a3.att.hudong.com/14/75/01300000164186121366756803686.jpg"
path=root+url.split('/')[-1]    # keep the local file name identical to the one in the URL
try:
    if not os.path.exists(root):
        os.mkdir(root)
    if not os.path.exists(path):
        # The timeout keeps the script from hanging on an unresponsive server;
        # raise_for_status() prevents an HTTP error page from being saved as an image.
        r=requests.get(url, timeout=30)
        r.raise_for_status()
        # The with-statement closes the file on exit, so no explicit close() is needed.
        with open(path,'wb') as f:
            f.write(r.content)
        print('文件保存成功')
    else:
        print("文件已存在")
except OSError:
    # OSError covers file-system failures and also request failures:
    # requests.RequestException derives from IOError, an alias of OSError.
    # Unlike a bare `except:`, this lets KeyboardInterrupt and programming
    # errors propagate.
    print("爬取失败")

最终文件保存在F盘的pics文件夹下

使用requests爬取任意输入的百度贴吧的网页，并保存到本地

#coding=utf-8
import requests

class TiebaSpider:
    """Download the list pages of one Baidu Tieba forum and save them as HTML files.

    Attributes set by ``__init__``:
        tieba_name: forum name given by the caller.
        url_tmp: list-page URL template with a ``{}`` placeholder for ``pn``.
        headers: HTTP headers (a browser User-Agent) sent with every request.
    """

    def __init__(self, tieba_name):
        self.tieba_name = tieba_name
        self.url_tmp = 'https://tieba.baidu.com/f?kw=' + self.tieba_name + '&ie=utf-8&pn={}'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134"}

    def get_url_list(self, page_count=1000):
        """Return the URLs of the first `page_count` list pages.

        Args:
            page_count: number of pages to generate (defaults to 1000).

        Returns:
            A list of URL strings whose ``pn`` values are 0, 50, 100, ...
        """
        # `pn` is used as an offset that advances by 50 per page, so page i
        # is pn = i * 50 (adding 50 instead would yield pn=50, 51, 52, ...).
        return [self.url_tmp.format(page * 50) for page in range(page_count)]

    def parse_url(self, url):
        """Fetch `url` with the spider's headers and return the body as text."""
        # The timeout keeps one unresponsive request from blocking the whole run.
        response = requests.get(url, headers=self.headers, timeout=30)
        return response.content.decode()  # bytes.decode() defaults to utf-8

    def save_html(self, html_str, page_num):
        """Write `html_str` to ``html/<tieba_name>-第<page_num>页.html``."""
        import os  # local import: this snippet's import section only brings in requests

        # Create the output directory on first use instead of failing in open().
        os.makedirs('html', exist_ok=True)
        file_path = 'html/{}-第{}页.html'.format(self.tieba_name, page_num)
        # The encoding is explicit so the write does not depend on the
        # platform's default text encoding.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_str)

    def run(self):
        """Fetch every list page and save it, numbering pages from 1."""
        # enumerate() supplies the page number directly; looking it up with
        # list.index() rescans the list for every URL.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)

if __name__ == '__main__':
    # Script entry point: build a spider for the '李毅' forum and run it.
    TiebaSpider('李毅').run()

python爬虫实战

抓取百度贴吧-生活大爆炸吧的基本内容

'''
抓取百度贴吧---生活大爆炸吧的基本内容
爬虫线路： requests - bs4
Python版本： 3.6
'''
import requests
import time
from bs4 import BeautifulSoup

# 首先我们写好抓取网页的函数
def get_html(url):
    """Fetch `url` and return the page text decoded as UTF-8.

    Returns:
        The response text, or the sentinel string " ERROR " when the request
        fails (connection problem, timeout, or an HTTP error status raised by
        raise_for_status()).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # The encoding is set by hand because Baidu Tieba is known to serve
        # UTF-8. For other sites, prefer: r.encoding = r.apparent_encoding
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Only request failures map to the sentinel; KeyboardInterrupt and
        # programming errors are no longer swallowed.
        return " ERROR "

def get_content(url):
    """Parse one Tieba list page and return its threads as a list of dicts.

    The page is fetched with get_html(url) and parsed with BeautifulSoup
    using the 'lxml' parser.

    Returns:
        A list of dicts with the keys 'title', 'link', 'name', 'time' and
        'replyNum'. A thread whose markup lacks one of the expected elements
        is skipped after a short notice is printed.
    """
    # Collected thread records.
    comments = []
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')

    # Thread entries are the <li> tags selected by the class string
    # ' j_thread_list clearfix'; find_all returns them as a list.
    liTags = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})

    for li in liTags:
        comment = {}
        try:
            # The title anchor supplies both the title text and the link,
            # so look it up once.
            title_anchor = li.find('a', attrs={'class': 'j_th_tit '})
            comment['title'] = title_anchor.text.strip()
            comment['link'] = "http://tieba.baidu.com/" + title_anchor['href']
            comment['name'] = li.find(
                'span', attrs={'class': 'tb_icon_author '}).text.strip()
            comment['time'] = li.find(
                'span', attrs={'class': 'pull-right is_show_create_time'}).text.strip()
            comment['replyNum'] = li.find(
                'span', attrs={'class': 'threadlist_rep_num center_text'}).text.strip()
            comments.append(comment)
        except (AttributeError, KeyError, TypeError):
            # find() returned None for a missing element, or the anchor has
            # no href. Skip this thread so one malformed entry does not stop
            # the crawl; other exception types are left to propagate.
            print('出了点小问题')

    return comments

def Out2File(dict):
    """Append the scraped thread records to TTBT.txt in the current directory.

    Args:
        dict: iterable of records, each providing the keys 'title', 'link',
            'name', 'time' and 'replyNum'. The name shadows the builtin but
            is kept so existing callers keep working.
    """
    line_format = '标题： {} \t 链接：{} \t 发帖人：{} \t 发帖时间：{} \t 回复数量： {} \n'
    # The encoding is explicit: the records contain Chinese text, and the
    # platform default encoding may not be able to represent it.
    with open('TTBT.txt', 'a+', encoding='utf-8') as f:
        for comment in dict:
            f.write(line_format.format(
                comment['title'], comment['link'], comment['name'], comment['time'], comment['replyNum']))

    print('当前页面爬取完成')

def main(base_url, deep):
    """Crawl `deep` list pages starting from `base_url`.

    Each page URL is `base_url` with '&pn=<50 * page index>' appended; the
    parsed content of every page is handed to Out2File.
    """
    # Build the full list of page URLs up front.
    page_urls = [base_url + '&pn=' + str(50 * page) for page in range(deep)]
    print('所有的网页已经下载到本地！ 开始筛选信息。。。。')

    # Parse each page and write its records out, one page at a time.
    for page_url in page_urls:
        Out2File(get_content(page_url))
    print('所有的信息都已经保存完毕！')

# Entry URL of the forum list page; kw carries the percent-encoded forum name.
base_url = (
    'http://tieba.baidu.com/f'
    '?kw=%E7%94%9F%E6%B4%BB%E5%A4%A7%E7%88%86%E7%82%B8'
    '&ie=utf-8'
)
# Number of list pages to crawl.
deep = 3
if __name__ == '__main__':
    main(base_url, deep)

爬取豆瓣图书名

#coding=utf-8
import urllib.request
import re
# Fetch the Douban Read provider page, extract the text of every
# <div class="name"> element, print each one and save them to ./1.txt.
y_url="https://read.douban.com/provider/all"
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
req=urllib.request.Request(url=y_url,headers=headers) # urlopen(url) alone is not enough here; the request must carry a User-Agent header
data=urllib.request.urlopen(req, timeout=30).read().decode("utf-8") # the response bytes must be decoded as utf-8
# The capture group keeps only the text between the tags; without it
# findall would return the surrounding <div> markup as well.
pat=r'<div class="name">(.*?)</div>'
rst=re.compile(pat).findall(data)
# `with` guarantees the file is closed even if a write fails, and the
# explicit encoding keeps Chinese names writable regardless of the
# platform's default encoding.
with open("./1.txt","w",encoding="utf-8") as fh:
    for name in rst:
        print(name)
        fh.write(name+"\n")   # one name per line

python爬虫实战

爬取百度百科1000个页面的数据

思路

python爬虫实战

目标：百度百科Python词条相关词条网页-标题和简介
入口页：http://baike.baidu.com/view/21087.htm
URL格式：
- 词条页面URL：/view/125370.htm
数据格式：
- 标题：
  - <dd class="lemmaWgt-lemmaTitle-title"><h1>***</h1></dd>
- 简介：
  - <div class="lemma-summary">***</div>
页面编码：UTF-8