""" a serial toy crawler program which crawls over the web pages from a given seed url indefinitely or upto the desired depth """ __todo__ = """ 1. crawl manage to be made multi thread aware and call crawl workers in configurable number of threads """ # some parts of the logic borrowed from scrapy project # https://github.com/scrapy/scrapy import argparse import urlparse import logging import HTMLParser import re import urllib2 import signal import sys import time from w3lib.url import safe_download_url from robotparser import RobotFileParser _USERAGENT = 'toycrawler' _ACCEPTED_CONTENTTYPES = re.compile('text/(plain|html);?') def register_signal_handlers(handler): # Ctrl C is used to stop the crawl signal.signal(signal.SIGINT, handler) class Utils(object): """a collection of useful utils""" @staticmethod def is_valid_url(url): """checks is the passed in url has one of the schemes http, https or file """ pr = urlparse.urlparse(url) return pr.scheme in ['http', 'https', 'file'] class Logger(object): """ custom logger built on top of logging module """ def __init__(self, filename, debuglog): self.logger = logging.getLogger('toycrawler') if debuglog: self.logger.setLevel(logging.DEBUG) else: self.logger.setLevel(logging.ERROR) loghandle = logging.FileHandler(filename) formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') loghandle.setFormatter(formatter) loghandle.setLevel(logging.DEBUG) self.logger.addHandler(loghandle) def dlog(self, log_message): self.logger.debug(log_message) def elog(self, log_message): self.logger.error(log_message) class ImproperSeedError(Exception): def __str__(self): return 'Invalid Seed Format - supported format scheme://netloc:port/path' class DepthExceededError(Exception): def __str__(self): return 'Maximum depth exceeded' class NoIndexMetaTagError(Exception): def __str__(self): return 'Web page has noindex meta tag. Not parsing this page.' class IgnoredExtensions: # borrowed from scrapy/linkextractor.py with some additions IGNORED_EXTENSIONS = set([ # images 'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif', 'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg', # audio 'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff', # video '3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv', 'm4a', # office suites 'xls', 'ppt', 'doc', 'docx', 'odt', 'ods', 'odg', 'odp', # other 'css', 'pdf', 'doc', 'exe', 'bin', 'rss', 'zip', 'rar', 'msi', 'asc', 'dmg', 'bz2', 'ics' ]) @classmethod def url_has_ignored_extension(cls, url): pr = urlparse.urlparse(url) try: ext = pr.path.split('.')[-1] except IndexError: return False return ext in cls.IGNORED_EXTENSIONS class HTMLDownloader(object): """ downloads any valid url returning text/plain or text/html """ def __init__(self, url, logger): # url passed in here is assumed to be safe and valid self.url = url self.log = logger self.data = None self.headers={ 'User-Agent': _USERAGENT, 'Accept': 'text-plain;text-html;' # may not be honored by server ! } def download(self): # do we need retries here ? 
req = urllib2.Request(self.url, self.data, self.headers) handle = None try: handle = urllib2.urlopen(req) except urllib2.HTTPError as e: self.log.elog('HTTP error in opening url ' + self.url + ' code: ' + str(e.code)) except urllib2.URLError as e: self.log.elog('exception in opening url ' + self.url + ' reason: ' + str(e.reason)) except IOError as e: self.log.elog('IOError exception in opening url ' + self.url + ' exception: ' + str(e)) headers = handle.info() _m = _ACCEPTED_CONTENTTYPES.search(headers.get('Content-type', '')) content_type = _m.group(1) if _m else None if content_type not in ["plain", "html"]: self.log.elog('content-type %s returned is not plain or html. url: %s' % (headers["Content-type"], self.url)) return '' if handle is None: self.log.elog('urlopen returned None handle for url: ' + self.url) return '' # all good so far ! return ''.join(handle.readlines()) class HTMLLinkExtractor(HTMLParser.HTMLParser): """ takes response content from parsing a link, parses it and extracts all the links from it. - if the url is not valid as per Utils.is_valid_url, skip it - if the url has ignored extension, then skip it - if the link tag has nofollow, then skip it - if there is a meta tag with noindex in the page, that page is skipped """ def __init__(self, url, logger): HTMLParser.HTMLParser.__init__(self) self.url = url self.log = logger self.urls = [] self.base_url = None def handle_starttag(self, tag, attrs): if tag == 'a': url = None for attr, value in attrs: if attr == 'href': url = value elif attr == 'rel' and value == 'nofollow': return # this link shouldn't be followed if url is not None: self.urls.append(safe_download_url(url)) if tag == 'base': self.base_url = dict(attrs).get('href') if tag == 'meta': no_index = dict(attrs).get('content', None) if no_index is not None and no_index == 'noindex': raise NoIndexMetaTagError def skip_url(self, url): # check if the passed in url is to be skipped if IgnoredExtensions.url_has_ignored_extension(url): self.log.dlog('url %s has ignored extension. skipping it.' % url) return True if not Utils.is_valid_url(url): self.log.dlog('url %s is not valid url for download. skipping it.' 
% url) return True return False def extracted_links(self, response_text): self.feed(response_text) self.close() urls = [] base_url = urlparse.urljoin(self.url, self.base_url) if self.base_url else self.url for _u in self.urls: url = safe_download_url(urlparse.urljoin(base_url, _u)) if not self.skip_url(url): urls.append(url) # TODO: review the following set usage return iter(set(urls)) class CrawlWorker(): """ worker logic which gets a unit of work(link) from Manager , download it, parses it and submits units of work(links) to the Manager """ def __init__(self, manager, logger): self.manager = manager self.log = logger def run(self): while True: self.get_work() if self.url is None: return # done # results DepthExceededError when set depth is exceeded self.process_work() def get_work(self): self.url = self.manager.get_url() def get_robotparser(self): # ask the manager for robot parser for this domain # if none found, get one and give it to manager pr = urlparse.urlparse(self.url) domain = pr.scheme + '://' + pr.netloc robotparser = self.manager.get_robotparser(domain) if robotparser is None: robotstxt = domain + '/' + 'robots.txt' robotparser = RobotFileParser(robotstxt) robotparser.read() self.manager.set_robotparser(domain, robotparser) return robotparser def process_work(self): pr = urlparse.urlparse(self.url) if pr.scheme != 'file': robotparser = self.get_robotparser() else: robotparser = None if pr.scheme == 'file' or robotparser.can_fetch(_USERAGENT, self.url): d = HTMLDownloader(self.url, self.log) response = d.download() self.manager.set_visited(self.url) if response != '': self.log.dlog('successfully visited %s' % self.url) self.manager.incr_successful_fetches() try: p = HTMLLinkExtractor(self.url, self.log) try: for u in p.extracted_links(response): if self.manager.not_visited(u): self.manager.add_url(u) else: self.log.dlog('url %s visited already.' % self.url) except NoIndexMetaTagError: self.log.dlog('url %s has meta tag with content=noindex. not parsing it.' 
% self.url) except HTMLParser.HTMLParseError as e: self.log.elog('parse error in url %s exception %s in line %s' % (self.url, e.msg, e.lineno)) else: self.manager.incr_unsuccessful_fetches() self.log.elog('HTMLDownloader empty string as response for url: %s' % self.url) else: self.log.dlog('url %s skipped as robots.txt in domain has Disallow rule' % self.url) class CrawlManager(object): """ singleton which encapsulates the repository of urls, visited links and logic to handle them - follows FIFO logic for crawling the urls """ def __init__(self, logger): self.urls = [] self.visited_urls = {} self.robotparsers = {} self.log = logger # seconds to sleep before spawning any request self.sleep_interval = 3 self.successful_fetches = 0 self.unsuccessful_fetches = 0 self.depth = -1 # go upto visiting all the urls def add_url(self, url): # implements the logic of depth constraint if (self.depth > 0 and len(self.visited_urls) >= self.depth): self.log.dlog('not queuing any more urls as depth exceeded.') raise DepthExceededError if self.not_visited(url): self.urls.append(url) def get_url(self): try: return self.urls.pop(0) except IndexError: return None def set_visited(self, url): self.visited_urls[url] = 1 def not_visited(self, url): return self.visited_urls.get(url, None) is None def visited(self, url): return self.visited_urls.get(url, None) is not None def set_robotparser(self, domain, robotparser): if self.robotparsers.get(domain, None) is None: self.robotparsers[domain] = robotparser else: self.log.elog('attempt to override a robotparser made.') def get_robotparser(self, domain): return self.robotparsers.get(domain, None) def incr_successful_fetches(self): self.successful_fetches += 1 def incr_unsuccessful_fetches(self): self.unsuccessful_fetches += 1 def start(self, depth): # if depth = -1, continue crawling till interrupted or no more urls # else go upto the depth and return self.depth = depth self.log.dlog('started crawling') try: # start calling crawl workers to do the work cw = CrawlWorker(self, self.log) cw.run() except DepthExceededError: pass self.log.dlog('stopped crawling') def stats(self): print '#successful fetches: ', self.successful_fetches print '#unsuccessful fetches: ', self.unsuccessful_fetches class ToyCrawler(object): def __init__(self, seed, depth, debuglog): self.seed = seed self.depth = depth self.log = Logger('toycrawler.log', debuglog) self.crawl_manager = CrawlManager(logger=self.log) def _sanitytest(self): # sanity test the seed if not Utils.is_valid_url(self.seed): raise ImproperSeedError def start(self): """ start crawling """ self._sanitytest() self.crawl_manager.add_url(self.seed) self.crawl_manager.start(depth=self.depth) self.crawl_manager.stats() # this makes ToyCrawler as callable to be used as interrput handler, # print stats and exit cleanly def __call__(self, signum, frame): print 'shutdown requested. stopping crawling.' self.log.dlog('stopped crawling') self.crawl_manager.stats() sys.exit(0) def parse_command_line(): """ sets up the required command line arguments parsing logic returns: ArgumentParser object """ args_parser = argparse.ArgumentParser( description='Toy Crawler - that just crawls webpages from the given seed url.', epilog='Stop with Keyboard interrupt (Ctrl-C) if you want stop crawling in between.') args_parser.add_argument('--seed', dest='seed', required=True, help='seed url to start the crawling with. format: scheme://netloc:port/path') args_parser.add_argument('--depth', dest='depth', default=-1, help='number of links to crawl up to. 
if 100, first 100 links from seed url are crawled.') args_parser.add_argument('--debug', dest='debuglog', default=0, help='enable debug logs? pass 1 if yes.', choices=['1']) return args_parser.parse_args() def main(): args = parse_command_line() t = ToyCrawler(args.seed, int(args.depth), int(args.debuglog)) register_signal_handlers(t) t.start() sys.exit(0) if __name__ == '__main__': main()
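For reference, here is a minimal usage sketch, not part of the original listing: it assumes the code above is saved as toycrawler.py, and http://example.com/ is only a placeholder seed. From the shell, the argparse flags defined above map to:

    python toycrawler.py --seed http://example.com/ --depth 100 --debug 1

The same run can be driven programmatically, which is handy when experimenting in an interpreter:

    # minimal sketch, assuming the listing above is importable as 'toycrawler'
    # and that http://example.com/ is just a placeholder seed url
    from toycrawler import ToyCrawler, register_signal_handlers

    crawler = ToyCrawler('http://example.com/', depth=100, debuglog=1)
    register_signal_handlers(crawler)  # Ctrl-C prints the stats and exits cleanly
    crawler.start()                    # runs until the depth is hit or the url queue empties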