In [1]:
import streamz
from requests_html import HTMLSession
# Single module-level session; get_response sends every request through it.
session = HTMLSession()
def get_response(url, timeout=None):
    """Fetch *url* through the shared module-level ``session``.

    Args:
        url: Address to request.
        timeout: Optional timeout forwarded to ``session.get``. The default
            ``None`` matches a call that passes no timeout at all.

    Returns:
        Whatever ``session.get`` returns for the request.
    """
    # ``session`` is only read here, never rebound, so no ``global``
    # declaration is required.
    return session.get(url, timeout=timeout)
def get_result(response):
    """Return a ``(title_match, url)`` pair for *response*.

    Args:
        response: Object exposing ``html.search(template)`` and ``url``.

    Returns:
        A 2-tuple: the value returned by searching the parsed HTML with the
        template ``'<title>{}</title>'``, followed by ``response.url``.
    """
    title_match = response.html.search('<title>{}</title>')
    return title_match, response.url
def get_links(response):
    """Return the ``absolute_links`` collection of *response*'s parsed HTML.

    Args:
        response: Object exposing ``html.absolute_links``.

    Returns:
        ``response.html.absolute_links`` unchanged.
    """
    return response.html.absolute_links
def is_special_url(url, marker='gndy'):
    """Tell whether *url* contains *marker*.

    Args:
        url: URL string to inspect.
        marker: Substring that marks a URL as special. Defaults to
            ``'gndy'``, the value that was previously hard-coded.

    Returns:
        ``True`` if *marker* occurs anywhere in *url*, else ``False``.
    """
    return marker in url
def is_special_response(response, marker='gndy'):
    """Tell whether the URL of *response* contains *marker*.

    Args:
        response: Object exposing a ``url`` string attribute.
        marker: Substring that marks a response as special. Defaults to
            ``'gndy'``, the value that was previously hard-coded.

    Returns:
        ``True`` if *marker* occurs anywhere in ``response.url``, else
        ``False``.
    """
    return marker in response.url
# Entry point of the crawl graph: URLs are pushed in via source.emit().
source = streamz.Stream()
# Drop URLs that have already passed through, so links fed back in below
# are not fetched a second time.
pages = source.unique()
# Throttle the flow (rate_limit's argument is an interval in seconds), then
# fetch each URL.
response = pages.rate_limit(1).map(get_response)
# Branch 1: keep only responses accepted by is_special_response and print
# the value get_result builds for each of them.
special_response = response.filter(is_special_response)
result = special_response.map(get_result)
# Branch 2: collect the links of every response (not just the special ones)
# and flatten each collection into a stream of individual links.
links = response.map(get_links).flatten()
# Feedback loop: pipe new links back into the source, and so into pages.
links.sink(source.emit)
result.sink(print)
In [2]:
# Render the stream graph built from `source`.
# NOTE(review): presumably needs graphviz to be installed — confirm.
source.visualize()
Out[2]:
In [3]:
# Seed the crawl with the start page; links found on fetched pages are fed
# back into `source` by the sink attached to `links`.
source.emit('http://www.dytt8.net')