"""
a serial toy crawler program which crawls over the web pages from a given
seed url indefinitely or upto the desired depth
"""
__todo__ = """
1. crawl manage to be made multi thread aware and call crawl workers in configurable
number of threads
"""
# some parts of the logic borrowed from scrapy project
# https://github.com/scrapy/scrapy
import argparse
import urlparse
import logging
import HTMLParser
import re
import urllib2
import signal
import sys
import time
from w3lib.url import safe_download_url
from robotparser import RobotFileParser
_USERAGENT = 'toycrawler'
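# matches Content-Type values like 'text/html; charset=utf-8' and captures the
# subtype ('plain' or 'html') in group 1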
_ACCEPTED_CONTENTTYPES = re.compile('text/(plain|html);?')
def register_signal_handlers(handler):
# Ctrl C is used to stop the crawl
signal.signal(signal.SIGINT, handler)
class Utils(object):
"""a collection of useful utils"""
@staticmethod
def is_valid_url(url):
"""checks is the passed in url has one of the schemes http, https or file """
pr = urlparse.urlparse(url)
return pr.scheme in ['http', 'https', 'file']
class Logger(object):
""" custom logger built on top of logging module
"""
def __init__(self, filename, debuglog):
self.logger = logging.getLogger('toycrawler')
if debuglog:
self.logger.setLevel(logging.DEBUG)
else:
self.logger.setLevel(logging.ERROR)
loghandle = logging.FileHandler(filename)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
loghandle.setFormatter(formatter)
loghandle.setLevel(logging.DEBUG)
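        # the handler accepts everything; the logger's own level (set above)
        # decides whether debug records actually reach the log file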
self.logger.addHandler(loghandle)
def dlog(self, log_message):
self.logger.debug(log_message)
def elog(self, log_message):
self.logger.error(log_message)
class ImproperSeedError(Exception):
def __str__(self):
return 'Invalid Seed Format - supported format scheme://netloc:port/path'
class DepthExceededError(Exception):
def __str__(self):
return 'Maximum depth exceeded'
class NoIndexMetaTagError(Exception):
def __str__(self):
return 'Web page has noindex meta tag. Not parsing this page.'
class IgnoredExtensions(object):
    # borrowed from scrapy/linkextractor.py with some additions
IGNORED_EXTENSIONS = set([
# images
'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',
# audio
'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',
# video
'3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
'm4a',
# office suites
'xls', 'ppt', 'doc', 'docx', 'odt', 'ods', 'odg', 'odp',
# other
        'css', 'pdf', 'exe', 'bin', 'rss', 'zip', 'rar', 'msi', 'asc',
'dmg', 'bz2', 'ics'
])
@classmethod
def url_has_ignored_extension(cls, url):
        pr = urlparse.urlparse(url)
        # note: split('.') always returns at least one element, so no
        # IndexError is possible; a path without a '.' simply won't match
        ext = pr.path.split('.')[-1].lower()
        return ext in cls.IGNORED_EXTENSIONS
class HTMLDownloader(object):
""" downloads any valid url returning text/plain or text/html
"""
def __init__(self, url, logger):
# url passed in here is assumed to be safe and valid
self.url = url
self.log = logger
self.data = None
        self.headers = {
            'User-Agent': _USERAGENT,
            'Accept': 'text/plain, text/html'  # may not be honored by the server !
        }
def download(self):
# do we need retries here ?
req = urllib2.Request(self.url, self.data, self.headers)
handle = None
try:
handle = urllib2.urlopen(req)
except urllib2.HTTPError as e:
self.log.elog('HTTP error in opening url ' + self.url + ' code: ' + str(e.code))
except urllib2.URLError as e:
self.log.elog('exception in opening url ' + self.url + ' reason: ' + str(e.reason))
except IOError as e:
self.log.elog('IOError exception in opening url ' + self.url + ' exception: ' + str(e))
        if handle is None:
            self.log.elog('urlopen returned None handle for url: ' + self.url)
            return ''
        headers = handle.info()
        _m = _ACCEPTED_CONTENTTYPES.search(headers.get('Content-type', ''))
        content_type = _m.group(1) if _m else None
        if content_type not in ['plain', 'html']:
            self.log.elog('content-type %s returned is not plain or html. url: %s'
                          % (headers.get('Content-type', ''), self.url))
            return ''
        # all good so far !
        return handle.read()
class HTMLLinkExtractor(HTMLParser.HTMLParser):
""" takes response content from parsing a link, parses it and extracts
all the links from it.
- if the url is not valid as per Utils.is_valid_url, skip it
- if the url has ignored extension, then skip it
- if the link tag has nofollow, then skip it
- if there is a meta tag with noindex in the page, that page is skipped
"""
def __init__(self, url, logger):
HTMLParser.HTMLParser.__init__(self)
self.url = url
self.log = logger
self.urls = []
self.base_url = None
def handle_starttag(self, tag, attrs):
if tag == 'a':
url = None
for attr, value in attrs:
if attr == 'href':
url = value
                elif attr == 'rel' and value and 'nofollow' in value.split():
return # this link shouldn't be followed
if url is not None:
self.urls.append(safe_download_url(url))
if tag == 'base':
self.base_url = dict(attrs).get('href')
        if tag == 'meta':
            meta = dict(attrs)
            content = meta.get('content') or ''
            if meta.get('name', '').lower() == 'robots' and 'noindex' in content:
                raise NoIndexMetaTagError
def skip_url(self, url):
# check if the passed in url is to be skipped
if IgnoredExtensions.url_has_ignored_extension(url):
self.log.dlog('url %s has ignored extension. skipping it.' % url)
return True
if not Utils.is_valid_url(url):
self.log.dlog('url %s is not valid url for download. skipping it.' % url)
return True
return False
def extracted_links(self, response_text):
self.feed(response_text)
self.close()
urls = []
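        # resolve relative links against the page's <base href=...> if one was
        # seen while parsing, otherwise against the page's own url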
base_url = urlparse.urljoin(self.url, self.base_url) if self.base_url else self.url
for _u in self.urls:
url = safe_download_url(urlparse.urljoin(base_url, _u))
if not self.skip_url(url):
urls.append(url)
# TODO: review the following set usage
return iter(set(urls))
class CrawlWorker(object):
    """ worker logic which gets a unit of work (a link) from the
    Manager, downloads it, parses it and submits new units of work (links)
    back to the Manager
    """
def __init__(self, manager, logger):
self.manager = manager
self.log = logger
def run(self):
while True:
self.get_work()
if self.url is None:
return # done
            # process_work raises DepthExceededError when the set depth is exceeded
self.process_work()
def get_work(self):
self.url = self.manager.get_url()
def get_robotparser(self):
# ask the manager for robot parser for this domain
# if none found, get one and give it to manager
pr = urlparse.urlparse(self.url)
domain = pr.scheme + '://' + pr.netloc
robotparser = self.manager.get_robotparser(domain)
if robotparser is None:
robotstxt = domain + '/' + 'robots.txt'
robotparser = RobotFileParser(robotstxt)
robotparser.read()
self.manager.set_robotparser(domain, robotparser)
return robotparser
def process_work(self):
pr = urlparse.urlparse(self.url)
if pr.scheme != 'file':
robotparser = self.get_robotparser()
else:
robotparser = None
if pr.scheme == 'file' or robotparser.can_fetch(_USERAGENT, self.url):
d = HTMLDownloader(self.url, self.log)
response = d.download()
self.manager.set_visited(self.url)
if response != '':
self.log.dlog('successfully visited %s' % self.url)
self.manager.incr_successful_fetches()
try:
p = HTMLLinkExtractor(self.url, self.log)
try:
for u in p.extracted_links(response):
if self.manager.not_visited(u):
self.manager.add_url(u)
else:
                                self.log.dlog('url %s visited already.' % u)
except NoIndexMetaTagError:
self.log.dlog('url %s has meta tag with content=noindex. not parsing it.' % self.url)
except HTMLParser.HTMLParseError as e:
self.log.elog('parse error in url %s exception %s in line %s' % (self.url, e.msg, e.lineno))
else:
self.manager.incr_unsuccessful_fetches()
self.log.elog('HTMLDownloader empty string as response for url: %s' % self.url)
else:
            self.log.dlog('url %s skipped as robots.txt for the domain has a Disallow rule' % self.url)
class CrawlManager(object):
""" singleton which encapsulates the repository of urls, visited links
and logic to handle them
- follows FIFO logic for crawling the urls
"""
def __init__(self, logger):
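        # urls is the FIFO work queue; visited_urls doubles as a visited set
        # and robotparsers caches one RobotFileParser per domain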
self.urls = []
self.visited_urls = {}
self.robotparsers = {}
self.log = logger
# seconds to sleep before spawning any request
self.sleep_interval = 3
self.successful_fetches = 0
self.unsuccessful_fetches = 0
self.depth = -1 # go upto visiting all the urls
def add_url(self, url):
# implements the logic of depth constraint
if (self.depth > 0 and len(self.visited_urls) >= self.depth):
self.log.dlog('not queuing any more urls as depth exceeded.')
raise DepthExceededError
if self.not_visited(url):
self.urls.append(url)
    def get_url(self):
        try:
            url = self.urls.pop(0)
        except IndexError:
            return None
        # be polite: pause before handing out the next url to be fetched
        time.sleep(self.sleep_interval)
        return url
def set_visited(self, url):
self.visited_urls[url] = 1
def not_visited(self, url):
return self.visited_urls.get(url, None) is None
def visited(self, url):
return self.visited_urls.get(url, None) is not None
def set_robotparser(self, domain, robotparser):
if self.robotparsers.get(domain, None) is None:
self.robotparsers[domain] = robotparser
else:
self.log.elog('attempt to override a robotparser made.')
def get_robotparser(self, domain):
return self.robotparsers.get(domain, None)
def incr_successful_fetches(self):
self.successful_fetches += 1
def incr_unsuccessful_fetches(self):
self.unsuccessful_fetches += 1
def start(self, depth):
# if depth = -1, continue crawling till interrupted or no more urls
# else go upto the depth and return
self.depth = depth
self.log.dlog('started crawling')
try:
# start calling crawl workers to do the work
cw = CrawlWorker(self, self.log)
cw.run()
except DepthExceededError:
pass
self.log.dlog('stopped crawling')
def stats(self):
print '#successful fetches: ', self.successful_fetches
print '#unsuccessful fetches: ', self.unsuccessful_fetches
class ToyCrawler(object):
def __init__(self, seed, depth, debuglog):
self.seed = seed
self.depth = depth
self.log = Logger('toycrawler.log', debuglog)
self.crawl_manager = CrawlManager(logger=self.log)
def _sanitytest(self):
# sanity test the seed
if not Utils.is_valid_url(self.seed):
raise ImproperSeedError
def start(self):
""" start crawling
"""
self._sanitytest()
self.crawl_manager.add_url(self.seed)
self.crawl_manager.start(depth=self.depth)
self.crawl_manager.stats()
    # this makes ToyCrawler callable so that it can be used as the interrupt
    # handler: print stats and exit cleanly
def __call__(self, signum, frame):
print 'shutdown requested. stopping crawling.'
self.log.dlog('stopped crawling')
self.crawl_manager.stats()
sys.exit(0)
def parse_command_line():
""" sets up the required command line arguments parsing logic
    returns: the parsed arguments (an argparse.Namespace object)
"""
args_parser = argparse.ArgumentParser(
description='Toy Crawler - that just crawls webpages from the given seed url.',
        epilog='Stop with a keyboard interrupt (Ctrl-C) if you want to stop crawling in between.')
args_parser.add_argument('--seed', dest='seed', required=True, help='seed url to start the crawling with. format: scheme://netloc:port/path')
args_parser.add_argument('--depth', dest='depth', default=-1, help='number of links to crawl up to. if 100, first 100 links from seed url are crawled.')
args_parser.add_argument('--debug', dest='debuglog', default=0, help='enable debug logs? pass 1 if yes.', choices=['1'])
return args_parser.parse_args()
def main():
args = parse_command_line()
t = ToyCrawler(args.seed, int(args.depth), int(args.debuglog))
register_signal_handlers(t)
t.start()
sys.exit(0)
if __name__ == '__main__':
main()
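To drive the crawler from another script rather than the command line, a
minimal sketch could look like the following (assuming the module is saved as
toycrawler.py, w3lib is installed, and http://example.com/ stands in for a
real seed url):

# hypothetical driver script, not part of the crawler module itself
from toycrawler import ToyCrawler, register_signal_handlers

crawler = ToyCrawler(seed='http://example.com/', depth=100, debuglog=1)
register_signal_handlers(crawler)  # Ctrl-C prints the stats and exits cleanly
crawler.start()                    # runs until the depth is hit or the queue empties

The equivalent command-line invocation would be
python toycrawler.py --seed http://example.com/ --depth 100 --debug 1.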
February 25, 2014
A toy crawler - crawls all the web pages from the given url