Getting started with Python crawlers (four): using a multi-threaded crawler

Multi-threaded crawler

First, let's review some of the concepts covered earlier:

1. A CPU can only execute one task at a time; multiple CPUs can execute multiple tasks simultaneously.
2. A CPU can only execute one process at a time; the other processes are in a non-running state.
3. The execution units contained in a process are called threads; a process can contain multiple threads.
4. A process's memory space is shared, and every thread in that process can use the shared space.
5. While one thread is using the shared space, the other threads must wait (blocked state).
6. A mutex prevents multiple threads from using the same memory space at the same time: the thread that gets there first locks the space, the others wait, and they can only enter once the lock is released (see the sketch after this list).
7. Process: one execution of a program.
8. Thread: the basic scheduling unit of CPU operations.
9. GIL (global interpreter lock): the execution "pass" in Python, of which there is only one. Only the thread holding the pass can enter the CPU to run; threads without the GIL cannot execute.
10. Python's multi-threading is suited to heavily I/O-bound work.
11. Python's multi-processing is suited to heavily CPU-bound parallel computation.
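
To make points 5 and 6 concrete, here is a minimal sketch (not from the original article, and written for Python 2 like the rest of the code here) of two threads updating a shared counter, with threading.Lock serving as the mutex; the counter name and iteration count are arbitrary choices for the example.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Minimal mutex sketch: two threads share one counter, and a Lock
# keeps their read-modify-write updates from interleaving.

import threading

counter = 0
lock = threading.Lock()

def worker():
    global counter
    for _ in range(100000):
        # Acquire the lock, update the shared space, release the lock
        with lock:
            counter += 1

threads = [threading.Thread(target=worker) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()

# With the lock the result is always 200000; without it, updates can be lost
print "counter =", counter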

Multi-threaded crawling of Qiushibaike (qiushibaike.com)

#!/usr/bin/env python
# -*- coding:utf-8 -*-

# Threading library
import threading
# Queue (the Python 2 standard-library module; renamed to queue in Python 3)
from Queue import Queue
# Parsing library
from lxml import etree
# Request processing
import requests
# json processing
import json
import time

class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        #threading.Thread.__init__(self)
        # Call the parent class initialization method
        super(ThreadCrawl, self).__init__()
        # Thread name
        self.threadName = threadName
        # Page number queue
        self.pageQueue = pageQueue
        # Data queue
        self.dataQueue = dataQueue
        # Request header
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'}

    def run(self):
        print "Start" + self.threadName
        while not CRAWL_EXIT:
            try:
                # Take a page number out of the queue, first in first out
                # The optional parameter block defaults to True
                # 1. If the queue is empty and block is True, the call does not return; it blocks until new data arrives in the queue
                # 2. If the queue is empty and block is False, a Queue.Empty exception is raised
                page = self.pageQueue.get(False)
                url = "http://www.qiushibaike.com/8hr/page/" + str(page) +"/"
                #print url
                content = requests.get(url, headers = self.headers).text
                time.sleep(1)
                self.dataQueue.put(content)
                #print len(content)
            except:
                pass
        print "end" + self.threadName

class ThreadParse(threading.Thread):
    def __init__(self, threadName, dataQueue, filename, lock):
        super(ThreadParse, self).__init__()
        # Thread name
        self.threadName = threadName
        # Data queue
        self.dataQueue = dataQueue
        # File object used to save the parsed data
        self.filename = filename
        # Lock
        self.lock = lock

    def run(self):
        print "Start" + self.threadName
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
                self.parse(html)
            except:
                pass
        print "Exit" + self.threadName

    def parse(self, html):
        # Parse as HTML DOM
        html = etree.HTML(html)

        node_list = html.xpath('//div[contains(@id, "qiushi_tag")]')

        for node in node_list:
            # xpath returns a list; take the first element by index: the user name
            username = node.xpath('./div/a/@title')[0]
            # Image link; the result may be an empty list, so no [0] index is taken
            image = node.xpath('.//div[@class="thumb"]//@src')
            # Text under the tag: the content of the post
            content = node.xpath('.//div[@class="content"]/span')[0].text
            # Text of the first <i> tag: the number of likes
            zan = node.xpath('.//i')[0].text
            # Text of the second <i> tag: the number of comments
            comments = node.xpath('.//i')[1].text

            items = {
                "username": username,
                "image": image,
                "content": content,
                "zan": zan,
                "comments": comments
            }

            # The object used in a "with" statement must implement two methods: __enter__ and __exit__
            # Regardless of what happens inside the block, the lock is acquired on entry and released on exit
            # Acquire the lock, write the data, release the lock
            with self.lock:
                # Write stored parsed data
                self.filename.write(json.dumps(items, ensure_ascii = False).encode("utf-8") + "\n")

# Global flags: setting them to True tells the collection / parsing threads to exit their loops
CRAWL_EXIT = False
PARSE_EXIT = False


def main():
    # Queue of page numbers, with a maximum size of 20
    pageQueue = Queue(20)
    # Put in the page numbers 1 to 20, first in first out
    for i in range(1, 21):
        pageQueue.put(i)

    # Data queue for the collected results (the HTML source of each page); with no argument the size is unlimited
    dataQueue = Queue()

    filename = open("duanzi.json", "a")
    # Create lock
    lock = threading.Lock()

    # The names of the three collection threads
    crawlList = ["Collection Thread No. 1", "Collection Thread No. 2", "Collection Thread No. 3"]
    # List that stores the three collection threads
    threadcrawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)


    # The names of the three parsing threads
    parseList = ["Parsing Thread No. 1", "Parsing Thread No. 2", "Parsing Thread No. 3"]
    # Store three parsing threads
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, filename, lock)
        thread.start()
        threadparse.append(thread)

    # Wait until pageQueue is empty, i.e. until all page numbers have been taken by the collection threads
    while not pageQueue.empty():
        pass

    # pageQueue is now empty, so signal the collection threads to exit their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True

    print "pageQueue is empty"

    for thread in threadcrawl:
        thread.join()
        print "1"

    while not dataQueue.empty():
        pass

    global PARSE_EXIT
    PARSE_EXIT = True

    for thread in threadparse:
        thread.join()
        print "2"

    with lock:
        # Close file
        filename.close()
    print "Thank you for using!"

if __name__ == "__main__":
    main()
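
The shutdown logic in main() busy-waits until the queues drain and then flips the CRAWL_EXIT and PARSE_EXIT flags. Purely as a point of comparison, and not how the original article does it, here is a sketch of the same producer/consumer hand-off using the Queue class's own task_done() and join() bookkeeping, with a None sentinel telling each worker to stop (the names taskQueue and worker are made up for this example):

#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Sketch: coordinating worker threads with Queue.task_done()/join()
# instead of busy-waiting on empty() plus global exit flags.

import threading
from Queue import Queue

taskQueue = Queue()

def worker():
    while True:
        page = taskQueue.get()        # blocks until a task is available
        if page is None:              # None is the agreed stop signal
            taskQueue.task_done()
            break
        # ... fetch and parse the page here ...
        taskQueue.task_done()         # mark this task as finished

threads = [threading.Thread(target=worker) for _ in range(3)]
for t in threads:
    t.start()

for page in range(1, 21):
    taskQueue.put(page)

taskQueue.join()                      # returns once every task is marked done

for _ in threads:
    taskQueue.put(None)               # one stop signal per worker
for t in threads:
    t.join()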
Reference: https://cloud.tencent.com/developer/article/1091702