Python is the most common language for writing crawlers, and it has real advantages for the job. Below are some common approaches.
1. Use the Scrapy framework.
https://scrapy-chs.readthedocs.io/zh_CN/0.24/intro/tutorial.html
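For reference, a minimal spider in the spirit of that tutorial might look like this. This is only a sketch assuming a recent Scrapy version; the spider name, start URL, and CSS selectors are illustrative, not taken from the tutorial.

import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com/"]

    def parse(self, response):
        # Yield one item per quote block on the page.
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}
        # Follow the pagination link, if any, and parse it the same way.
        next_page = response.css("li.next a::attr(href)").get()
        if next_page:
            yield response.follow(next_page, self.parse)

Saved as quotes_spider.py, it can be run without a project scaffold: scrapy runspider quotes_spider.py -o quotes.json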
2. A plain script
Libraries used:
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
import Queue, time, random
import requests
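Roughly how these four pieces fit together, before the full script: a queue of URLs, a pool of worker threads, requests to fetch, BeautifulSoup to parse. This is a minimal sketch only; example.com and the worker count are placeholders, and under Python 2 ThreadPoolExecutor comes from the futures backport.

#coding=utf-8
import Queue
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

task_queue = Queue.Queue()
task_queue.put('https://example.com/')  # placeholder seed URL

def worker():
    # Drain the queue; each worker fetches and parses one URL at a time.
    while not task_queue.empty():
        url = task_queue.get()
        # requests also accepts a per-call proxy map, e.g. proxies={'https': ip}
        resp = requests.get(url, timeout=10)
        soup = BeautifulSoup(resp.text, 'html.parser')
        print soup.title

executor = ThreadPoolExecutor(max_workers=4)
for _ in range(4):
    executor.submit(worker)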
Below is a fairly rough script (Python 2) that uses a proxy, a queue, multiple threads, and BeautifulSoup. The final print calls should be guarded by a thread lock so concurrent output does not interleave; a sketch of that fix follows the script.
#coding=utf-8
from concurrent.futures import ThreadPoolExecutor
import Queue, time, random
from bs4 import BeautifulSoup
import urllib
import urllib2
import gzip
from StringIO import StringIO
import threading

index_url = 'https://www.juzimi.com/alltags'
page_url = 'https://www.juzimi.com/alltags?page=%s'
task_queue = Queue.Queue()
has_words = []
has_words_value = {}
lock = threading.Lock()  # meant to guard print, see the note below
ip = ""

def getIp():
    # Fetch a fresh proxy IP from the proxy vendor's API.
    global ip
    url = 'http://s.zdaye.com/?api=201903221353043521&count=1&px=2'
    ipret = curl(url, '', False, False)
    time.sleep(5)
    print "get ip:" + str(ipret)
    ip = str(ipret)

def curl(url, data='', isCompress=False, use_ip=True):
    # Fetch a URL, optionally through the proxy and with gzip decoding.
    global ip
    if data:
        data = urllib.urlencode(data)
    else:
        data = None
    headers = {"method": "GET",
               "Accept-Encoding": "gzip, deflate, br",
               "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"}
    try:
        # Proxy setup: route HTTPS traffic through the current proxy IP.
        if use_ip:
            opener = urllib2.build_opener(urllib2.ProxyHandler({"https": ip}))
            urllib2.install_opener(opener)
        request = urllib2.Request(url, data, headers)
        response = urllib2.urlopen(request, timeout=10)
        if isCompress:
            buf = StringIO(response.read())
            return gzip.GzipFile(fileobj=buf).read()
        return response.read()
    except:
        # On failure, grab a new proxy IP and retry the request.
        getIp()
        print "get ip retry"
        return curl(url, data, isCompress, use_ip)

def setTaskR():
    # Seed the queue with a single tag, for testing.
    task_queue.put({'url': 'https://www.juzimi.com/tags/%E6%96%B0%E8%AF%97', 'title': u'诗词'})

def setTask():
    # Walk the tag index pages and queue every tag link found on them.
    for i in range(0, 12):
        url = page_url % (i)
        content = curl(url, '', True)
        soup = BeautifulSoup(content, "html.parser")
        span = soup.findAll('div', {'class': 'views-field-name'})
        for tmp in span:
            href = tmp.find('a').get('href')
            title = tmp.find('a').get('title')
            data = {"url": "https://www.juzimi.com" + href, "title": title}
            print data
            task_queue.put(data)
        time.sleep(1)

def getFile():
    # Rebuild the task queue from the crawl log, skipping tags that
    # already have more than 100 lines collected.
    global has_words
    global has_words_value
    for line in open('word.log'):
        if line.find('juzimi.com') != -1:
            continue
        line = line.split(":", 1)
        if len(line) > 1 and line[1] == 'test':
            continue
        if not line[0] in has_words:
            has_words.append(line[0])
            has_words_value[line[0]] = 1
        else:
            has_words_value[line[0]] = has_words_value[line[0]] + 1
    has_words = []
    for k in has_words_value:
        if has_words_value[k] > 100:
            has_words.append(k)
    for line in open('word.url'):
        lines = eval(line)
        title = lines['title'].encode('utf-8')
        if title in has_words:
            continue
        task_queue.put(lines)

def runTask():
    # Worker loop: take a tag off the queue, work out how many pages it
    # has, print the quotes on the first page, then spawn one thread per
    # remaining page.
    while not task_queue.empty():
        data = task_queue.get()
        hotword = data['title'].encode('utf-8')
        url = data['url']
        lastIndex = 0
        content = curl(url, '', True)
        content = content.replace('<br/>', '').replace('\r', ' ')
        soup = BeautifulSoup(content, "html.parser")
        last = soup.find('li', {'class': 'pager-last'})
        if not last:
            # No "last page" link; fall back to the numbered pager items.
            last = soup.findAll('li', {'class': 'pager-item'})
            if not last:
                print "get empty:" + url
                continue
            for tmp in last:
                if int(tmp.text) > lastIndex:
                    lastIndex = int(tmp.text)
        else:
            lastIndex = last.text
        span = soup.findAll('div', {'class': 'views-field-phpcode-1'})
        if not span:
            print "get empty:" + url
            continue
        for tmp in span:
            for word in tmp.findAll('a'):
                # This print should be wrapped in `lock` (see the note below).
                print hotword + ":" + word.text.encode('utf-8')
        # Throttling hook, currently disabled.
        sleep_time = random.randint(10, 20)
        #time.sleep(sleep_time)
        for i in range(1, int(lastIndex)):
            url = "https://www.juzimi.com/tags/" + hotword + "?page=" + str(i)
            t = threading.Thread(target=getContent, args=(url, hotword))
            t.start()

def getContent(url, hotword):
    # Fetch one pagination page of a tag and print its quotes.
    content = curl(url, '', True)
    content = content.replace('<br/>', '').replace('\r', ' ')
    soup = BeautifulSoup(content, "html.parser")
    span = soup.findAll('div', {'class': 'views-field-phpcode-1'})
    for tmp in span:
        for word in tmp.findAll('a'):
            print hotword + ":" + word.text.encode('utf-8')
    sleep_time = random.randint(20, 30)
    #time.sleep(sleep_time)

# setTaskR() / setTask() are alternative ways to seed the queue;
# the default flow resumes from the log files instead.
getIp()
getFile()
executor = ThreadPoolExecutor(max_workers=50)
for i in range(0, 20):
    executor.submit(runTask)
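As noted above, bare print statements from many threads can interleave mid-line. A minimal fix is to funnel all output through the lock the script already defines. A sketch, where the helper name safe_print is mine, not from the original script:

def safe_print(line):
    # Hold the module-level lock so each line is written atomically.
    with lock:
        print line

Each print hotword + ":" + word.text.encode('utf-8') call in runTask and getContent would then become safe_print(hotword + ":" + word.text.encode('utf-8')).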