'''
Crawl the basic thread listing of the Baidu Tieba forum for The Big Bang Theory (生活大爆炸吧).
Crawler stack: requests + bs4
Python version: 3.6
'''
import requests
from bs4 import BeautifulSoup


def get_html(url):
    '''Download a page and return its text; return an error marker on any request failure.'''
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ' ERROR '


def get_content(url):
    '''
    Parse one forum list page and collect the thread information into a list of dicts.
    '''
    comments = []
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    liTags = soup.find_all('li', attrs={'class': ' j_thread_list clearfix'})
    for li in liTags:
        comment = {}
        try:
            comment['title'] = li.find(
                'a', attrs={'class': 'j_th_tit '}).text.strip()
            comment['link'] = 'http://tieba.baidu.com/' + \
                li.find('a', attrs={'class': 'j_th_tit '})['href']
            comment['name'] = li.find(
                'span', attrs={'class': 'tb_icon_author '}).text.strip()
            comment['time'] = li.find(
                'span', attrs={'class': 'pull-right is_show_create_time'}).text.strip()
            comment['replyNum'] = li.find(
                'span', attrs={'class': 'threadlist_rep_num center_text'}).text.strip()
            comments.append(comment)
        except (AttributeError, TypeError):
            # A list item is missing one of the expected tags; skip it.
            print('Skipped a thread entry with missing fields')
    return comments


def Out2File(comments):
    '''
    Append the crawled thread information to TTBT.txt in the current directory.
    '''
    with open('TTBT.txt', 'a+', encoding='utf-8') as f:
        for comment in comments:
            f.write('Title: {} \t Link: {} \t Author: {} \t Post time: {} \t Replies: {} \n'.format(
                comment['title'], comment['link'], comment['name'], comment['time'], comment['replyNum']))
    print('Current page crawled and saved')


def main(base_url, deep):
    # Each Tieba list page shows 50 threads; the pn parameter is the page offset.
    url_list = []
    for i in range(0, deep):
        url_list.append(base_url + '&pn=' + str(50 * i))
    print('Page URLs generated, start crawling and extracting the info...')

    for url in url_list:
        content = get_content(url)
        Out2File(content)
    print('All information has been saved!')


base_url = 'http://tieba.baidu.com/f?kw=%E7%94%9F%E6%B4%BB%E5%A4%A7%E7%88%86%E7%82%B8&ie=utf-8'
deep = 3

if __name__ == '__main__':
    main(base_url, deep)
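
Because main() only runs under the __name__ guard, the module can also be imported without starting a full crawl, which makes it easy to test the parser on a single page first. The snippet below is a minimal sanity-check sketch, assuming the script above is saved as tieba_spider.py; both the file name and this check are assumptions, not part of the original.

# Quick sanity check of the parser on the first list page only.
# Assumes the script above is saved as tieba_spider.py; importing it is
# safe because main() only runs under the __name__ == '__main__' guard.
from tieba_spider import base_url, get_content

first_page = base_url + '&pn=0'      # pn=0 is the first page of the forum
threads = get_content(first_page)
print('Parsed {} threads from the first page'.format(len(threads)))
for t in threads[:3]:                # show a few entries without writing to TTBT.txt
    print(t['title'], '->', t['link'])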