import re

# Compiled once at import time. Intended rule (the original pattern had
# unclosed character classes '[A-Z)' and '[a-z)' and would raise re.error):
# at least one digit, one uppercase and one lowercase letter, and a total
# of 8+ word characters.
_PWD_RE = re.compile(r'^(?=.*[0-9])(?=.*[A-Z])(?=.*[a-z])\w{8,}$')


def checkPwd(pwd):
    """Return True if *pwd* is a strong password, else False.

    Strong means: only word characters, length >= 8, and containing at
    least one digit, one uppercase letter and one lowercase letter.
    """
    return bool(_PWD_RE.match(pwd))
读取网站名称并存入表格
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# coding=utf-8
import re

import requests
# Read URLs from test.txt (GBK), fetch each page, extract its <title>,
# and append "title,url" rows to url.csv (GBK).
_TITLE_RE = re.compile(r"<title>(.*?)</title>")

with open('test.txt', 'r', encoding='gbk') as src:
    try:
        for line in src:
            url = line.strip()  # requests.get must not see the trailing newline
            if not url:
                continue
            resp = requests.get(url)
            titles = _TITLE_RE.findall(resp.text)
            print(titles)
            # findall returns a (possibly empty) list, never None, so test
            # truthiness — the original 'not data is None' was always True.
            if titles:
                # Separate handle name: the original reused 'f' and shadowed
                # the still-open input file.
                with open('url.csv', 'a', encoding='gbk') as out:
                    # One CSV row per URL; the original's "'\n'.strip()"
                    # stripped the newline literal to '' and wrote nothing.
                    out.write(titles[0] + ',' + url + '\n')
    except Exception as e:
        # Best-effort batch job: report the failure and stop, as before.
        print(e)
def getHttpStatusCode(url):
    """Return the HTTP status code for *url*, or the exception on failure.

    Performs a GET with a 0.5 s timeout. On any requests-level failure the
    exception object itself is returned (callers compare the result against
    200 and print anything else).
    """
    try:
        response = requests.get(url, timeout=0.5)
        return response.status_code
    except requests.exceptions.RequestException as e:
        # HTTPError alone misses ConnectionError/Timeout, which a bare
        # .get() raises far more often; RequestException covers them all.
        return e
if __name__ == "__main__":
    # Probe every host listed in test.txt and record the ones answering 200.
    with open('test.txt', 'r', encoding="gbk") as src:
        for line in src:
            host = line.strip()  # drop the newline before building the URL
            if not host:
                continue
            url = "http://" + host
            try:
                status = getHttpStatusCode(url)
                print(status)
                if status == 200:
                    # Distinct handle name: the original reopened as 'f' and
                    # shadowed the input file while still iterating it.
                    with open('200url.txt', 'a', encoding="gbk") as out:
                        # The original's "'\n'.strip()" was a no-op newline;
                        # write exactly one URL per line.
                        out.write(url + '\n')
                    print(url)
                else:
                    print('no 200 code')
            except Exception as e:
                # Keep the loop going past a single bad host.
                print(e)
# Remove duplicate lines from test.txt, writing each unique (stripped) line
# exactly once to result.txt, in first-seen order.
lines_seen = set()  # set membership is O(1) per lookup
with open('test.txt', 'r') as src, open('result.txt', 'w') as dst:
    for line in src:  # iterate lazily; no need to readlines() the whole file
        url = line.strip()
        if url not in lines_seen:
            dst.write(url + '\n')
            lines_seen.add(url)
此脚本删除test.txt中重复的数据,然后将不重复的数据保存到result.txt中去
筛选两个文件中不同的数据
1 2 3 4 5 6 7
# Print every line of test1.txt that does not appear in test2.txt.
# Lines are compared raw (including their trailing newlines), as before.
with open('test1.txt', 'r') as f1, open('test2.txt', 'r') as f2:
    lines1 = f1.readlines()
    lines2 = set(f2.readlines())  # set: O(1) membership vs O(n) list scan
for entry in lines1:
    if entry not in lines2:
        print(entry)