使用流快速构建爬虫

In [1]:
import streamz

from requests_html import HTMLSession
# Module-level HTML session; get_response() below issues its requests through it.
session = HTMLSession()

def get_response(url):
    """Fetch ``url`` via the module-level ``session`` and return the response."""
    # Only reading the module-level name, so no ``global`` declaration is needed.
    response = session.get(url)
    return response

def get_result(response):
    """Return a ``(title_match, url)`` tuple for ``response``.

    ``title_match`` is whatever ``response.html.search`` yields for the
    ``<title>{}</title>`` template; ``url`` is ``response.url``.
    """
    title_match = response.html.search('<title>{}</title>')
    page_url = response.url
    return title_match, page_url

def get_links(response):
    """Return the ``absolute_links`` collection of the response's HTML."""
    page = response.html
    return page.absolute_links

def is_special_url(url):
    """Tell whether ``url`` contains the marker ``'gndy'``."""
    marker = 'gndy'
    return marker in url

def is_special_response(response):
    return 'gndy' in …
more ...