Multi-threaded scraping of Qiushibaike with Python 3
1. Import the modules:
This example uses multiple threads; a Queue is used to pass data between them.
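The import block itself is not reproduced in this excerpt. Below is a plausible set of imports for the full script, assuming the crawl threads fetch pages with requests and the parse threads extract posts with lxml and write JSON lines (those two classes are only outlined in this post):

import threading
import json             # assumed: the parse threads write JSON lines
from queue import Queue

import requests         # assumed: used by ThreadCrawl to download pages
from lxml import etree  # assumed: used by ThreadParse to extract the posts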
2. Create the class that crawls the pages
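The original crawl class is not shown in this excerpt. The sketch below is only an illustration that matches how main() uses it, ThreadCrawl(threadName, pageQueue, dataQueue), together with the CRAM_EXIT flag; the URL pattern and request headers are assumptions, not the author's original code:

class ThreadCrawl(threading.Thread):
    """Takes page numbers from pageQueue, downloads each page, and puts the HTML on dataQueue."""

    def __init__(self, threadName, pageQueue, dataQueue):
        super().__init__(name=threadName)
        self.threadName = threadName
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        # Assumed request headers; a desktop User-Agent usually avoids the mobile page
        self.headers = {"User-Agent": "Mozilla/5.0"}

    def run(self):
        print("Starting " + self.threadName)
        while not CRAM_EXIT:
            try:
                # Non-blocking get so the loop can notice CRAM_EXIT once the queue is empty
                page = self.pageQueue.get(False)
                # Assumed URL pattern for the list pages
                url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                html = requests.get(url, headers=self.headers).text
                self.dataQueue.put(html)
            except Exception:
                # Queue was empty or the request failed; check the exit flag and retry
                pass
        print("Exiting " + self.threadName)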
3. Create the class that processes the data
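The data-processing class is not shown here either. The following sketch is consistent with the call ThreadParse(threadName, dataQueue, filename, lock) and the PARSE_EXIT flag used in main(); the XPath expression is a placeholder that would have to be adapted to the actual page layout:

class ThreadParse(threading.Thread):
    """Takes raw HTML from dataQueue, extracts the posts, and appends them to the JSON file."""

    def __init__(self, threadName, dataQueue, filename, lock):
        super().__init__(name=threadName)
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.filename = filename  # the file object opened in main()
        self.lock = lock

    def run(self):
        print("Starting " + self.threadName)
        while not PARSE_EXIT:
            try:
                # Non-blocking get so the loop can notice PARSE_EXIT once the queue is empty
                html = self.dataQueue.get(False)
                self.parse(html)
            except Exception:
                # Queue was empty or parsing failed; check the exit flag and retry
                pass
        print("Exiting " + self.threadName)

    def parse(self, html):
        doc = etree.HTML(html)
        # Placeholder XPath; the real expression depends on the current page structure
        for node in doc.xpath('//div[contains(@class, "article")]'):
            item = {"content": node.xpath("string(.)").strip()}
            # The lock keeps writes from different parse threads from interleaving
            with self.lock:
                self.filename.write(json.dumps(item, ensure_ascii=False) + "\n")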
4. Create the main (driver) function
CRAM_EXIT = False   # set to True once every page number has been handed out
PARSE_EXIT = False  # set to True once every downloaded page has been parsed


def main():
    # Queue of the page numbers to crawl (pages 1-20)
    pageQueue = Queue(20)
    for i in range(1, 21):
        pageQueue.put(i)

    # Queue holding the downloaded pages for the parse threads
    dataQueue = Queue()

    # Output file; utf-8 so the Chinese text is written correctly
    filename = open("E://file/qiushi2.json", "a", encoding="utf-8")

    # Lock that serializes writes to the output file
    lock = threading.Lock()

    # Names of the three crawl threads
    crawList = ["Crawl thread 1", "Crawl thread 2", "Crawl thread 3"]
    # Start the three crawl threads and keep references to them
    threadcrawl = []
    for threadName in crawList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # Names of the three parse threads
    parseList = ["Parse thread 1", "Parse thread 2", "Parse thread 3"]
    # Start the three parse threads and keep references to them
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, filename, lock)
        thread.start()
        threadparse.append(thread)

    # Wait until every page number has been taken, then tell the crawl threads to stop
    while not pageQueue.empty():
        pass
    global CRAM_EXIT
    CRAM_EXIT = True
    print("pageQueue is empty")
    for thread in threadcrawl:
        thread.join()
    print("All crawl threads have finished")

    # Wait until every downloaded page has been parsed, then tell the parse threads to stop
    while not dataQueue.empty():
        pass
    global PARSE_EXIT
    PARSE_EXIT = True
    for thread in threadparse:
        thread.join()
    print("All parse threads have finished")

    # Close the output file
    with lock:
        filename.close()
    print("Done, thanks for using!")


if __name__ == "__main__":
    main()