# 多线程爬虫的使用 (using a multithreaded crawler)
import os
import queue
import re
import threading
import time
import urllib.error
import urllib.request
# Crawl WeChat article data with multiple threads.

# Queue shared between the threads; holds the article links found on each
# search-result page.
urlqueue = queue.Queue()

# Spoof a desktop browser so the site serves normal pages.
# BUG FIX: the original User-Agent began with "Moz+illa/5.0", which is not a
# valid browser product token and marks the client as a bot.
headers = ('User-Agent',
           'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36')
opener = urllib.request.build_opener()
opener.addheaders = [headers]
# Make the spoofed header the process-wide default for urlopen().
urllib.request.install_opener(opener)
# Fetch a URL through an HTTP proxy server.
def use_proxy(proxy_addr, url):
    """Fetch *url* through the HTTP proxy *proxy_addr* ("host:port").

    Returns the page body decoded as UTF-8 text, or None when the request
    fails (errors are printed and the thread backs off with a sleep).
    """
    try:
        # BUG FIX: the proxy mapping must be given to ProxyHandler; the
        # original passed it to HTTPHandler, which ignores it.
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        # BUG FIX: urlopen().read() returns bytes, so the page must be
        # decode()d, not encode()d as the original did.
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        time.sleep(10)  # back off before the caller retries
    except Exception as e:
        print("Exception:" + str(e))
        time.sleep(1)
# Thread 1: fetch each search-result page, extract the article links, and
# push the real links onto the shared queue.
class getUrl(threading.Thread):
    """Collect article links for *keywords* from result pages [pagestart, pageend)."""

    def __init__(self, keywords, pagestart, pageend, proxy):
        # BUG FIX: the original called self.threading.Thread.__init__(self),
        # which raises AttributeError; initialize the base class properly.
        threading.Thread.__init__(self)
        self.keywords = keywords    # search keywords (plain text, quoted later)
        self.pagestart = pagestart  # first result page (inclusive)
        self.pageend = pageend      # last result page (exclusive, as original)
        self.proxy = proxy          # "host:port" proxy for use_proxy()

    def run(self):
        try:
            # Links found per page, collected before being queued.
            listUrls = []
            # URL-encode the keywords and the "&page" fragment.
            keywords_encode = urllib.request.quote(self.keywords)
            page_encode = urllib.request.quote("&page")
            for page in range(self.pagestart, self.pageend):
                url = ("http://weixin.sogou.com/weixin?type=2&query="
                       + keywords_encode + page_encode + str(page))
                data = use_proxy(self.proxy, url)
                # NOTE(review): the original regex literal was lost in the
                # paste; this pattern matches the article links inside the
                # result list boxes — verify against the live page markup.
                pattrn = '<div class="txt-box">.*?(http://.*?)"'
                listUrls.append(re.compile(pattrn, re.S).findall(data))
            # Turn each escaped href into a real link and queue it.
            for page_links in listUrls:
                for link in page_links:
                    urlqueue.put(link.replace("amp;", ""))
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            time.sleep(10)
        except Exception as e:
            print("Exception:" + str(e))
            time.sleep(1)
# Thread 2: take article links from the queue, fetch each page, and append
# its title and body to a local HTML report file.
class getContent(threading.Thread):
    """Consume urlqueue and write title/content of each article to 9.html."""

    def __init__(self, urlqueue, proxy):
        # BUG FIX: initialize the Thread base class properly (the original
        # self.threading.Thread.__init__(self) raises AttributeError).
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue  # queue of article URLs filled by getUrl
        self.proxy = proxy        # "host:port" proxy for use_proxy()

    def run(self):
        # NOTE(review): the HTML header literal was mangled in the original
        # paste; reconstructed as a minimal UTF-8 document head — verify.
        html1 = ('<html><head><meta http-equiv="Content-Type" '
                 'content="text/html; charset=utf-8" />'
                 '<title>微信文章</title></head><body>')
        data = open("D:/python/file/9.html", "wb")
        data.write(html1.encode('utf-8'))
        data.close()
        try:
            x = 1
            while True:
                # BUG FIX: read from the queue handed to this thread, not the
                # bare global name the original used.
                url = self.urlqueue.get()
                # BUG FIX: the original parsed articelData without ever
                # fetching the page; download it first.
                articelData = use_proxy(self.proxy, url)
                # NOTE(review): title regex literal was lost in the paste;
                # standard <title> extraction assumed — verify.
                title_pattrn = '<title>(.*?)</title>'
                title = re.compile(title_pattrn, re.S).findall(articelData)
                content_pattrn = 'id="js_content">(.*?)id="js_sg_bar"'
                content = re.compile(content_pattrn, re.S).findall(articelData)
                # Fallbacks when the page does not match the patterns.
                articelTitle = "没有获取到标题"
                articelContent = "没有获取到内容"
                if title != []:
                    articelTitle = title[0]
                if content != []:
                    articelContent = content[0]
                detail = ("标题为:" + articelTitle + "<br/>内容为:"
                          + articelContent + "<br/><br/>")
                data = open("D:/python/file/9.html", "ab")
                data.write(detail.encode('utf-8'))
                data.close()
                print("第" + str(x) + "次处理数据")
                x += 1
            # NOTE(review): the loop above never terminates normally, so this
            # closing-tag write is unreachable (as in the original) — the
            # control thread kills the process instead.
            html2 = '''</body></html>'''
            data = open("D:/python/file/9.html", "ab")
            data.write(html2.encode('utf-8'))
            data.close()
        except urllib.error.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            time.sleep(10)
        except Exception as e:
            print("Exception:" + str(e))
            time.sleep(1)
# Thread 3: supervise the other threads and stop the program once the
# URL queue has drained.
class cotrol(threading.Thread):
    """Poll urlqueue once a minute and terminate the process when it is empty."""

    def __init__(self, urlqueue):
        # BUG FIX: initialize the Thread base class properly (the original
        # self.threading.Thread.__init__(self) raises AttributeError).
        threading.Thread.__init__(self)
        self.urlqueue = urlqueue  # queue shared with getUrl / getContent

    def run(self):
        while True:
            print("程序正在执行中...")
            time.sleep(60)  # poll interval; give workers time to make progress
            if self.urlqueue.empty():
                print("程序执行完毕!")
                # BUG FIX: exit() only raises SystemExit in THIS thread and
                # leaves the worker threads running; os._exit(0) actually
                # terminates the whole process.
                os._exit(0)
# Entry point: configure the crawl and launch the three worker threads.
# Guarded so importing this module does not start network activity.
if __name__ == "__main__":
    keywords = "物联网"
    pagestart = 1
    pageend = 3
    # NOTE(review): "123.15.156" has only three octets and cannot be a valid
    # IPv4 address — replace with a working "host:port" proxy before running.
    proxy = "123.15.156:8089"

    # BUG FIX: the original constructed every thread with no arguments,
    # which raises TypeError against each __init__; pass the required
    # parameters instead.
    listUrl = getUrl(keywords, pagestart, pageend, proxy)
    listUrl.start()
    content = getContent(urlqueue, proxy)
    content.start()
    control = cotrol(urlqueue)
    control.start()