February 25, 2014

A toy crawler - crawls web pages starting from a given url

"""
a serial toy crawler program which crawls web pages starting from a given
seed url, either indefinitely or up to a desired depth
"""

__todo__ = """
1. CrawlManager to be made multi-thread aware so it can run crawl workers in a
configurable number of threads
"""

# some parts of the logic borrowed from scrapy project
# https://github.com/scrapy/scrapy

import argparse
import urlparse
import logging
import HTMLParser
import re
import urllib2
import signal
import sys
import time

from w3lib.url import safe_download_url
from robotparser import RobotFileParser

_USERAGENT = 'toycrawler'
_ACCEPTED_CONTENTTYPES = re.compile('text/(plain|html);?')

def register_signal_handlers(handler):
    # Ctrl C is used to stop the crawl
    signal.signal(signal.SIGINT, handler)

class Utils(object):
    """a collection of useful utils"""

    @staticmethod
    def is_valid_url(url):
        """checks is the passed in url has one of the schemes http, https or file """
        pr = urlparse.urlparse(url)
        return pr.scheme in ['http', 'https', 'file']

class Logger(object):
    """ custom logger built on top of logging module
    """

    def __init__(self, filename, debuglog):
        self.logger = logging.getLogger('toycrawler')

        if debuglog:
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.ERROR)
            
        loghandle = logging.FileHandler(filename)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        loghandle.setFormatter(formatter)
        loghandle.setLevel(logging.DEBUG)
        self.logger.addHandler(loghandle)

    def dlog(self, log_message):
        self.logger.debug(log_message)

    def elog(self, log_message):
        self.logger.error(log_message)

class ImproperSeedError(Exception):

    def __str__(self):
        return 'Invalid Seed Format - supported format scheme://netloc:port/path'

class DepthExceededError(Exception):

    def __str__(self):
        return 'Maximum depth exceeded'

class NoIndexMetaTagError(Exception):

    def __str__(self):
        return 'Web page has noindex meta tag. Not parsing this page.'

class IgnoredExtensions:
    # borrowed from scrapy/linkextractor.py with some additions
    IGNORED_EXTENSIONS = set([
        # images
        'mng', 'pct', 'bmp', 'gif', 'jpg', 'jpeg', 'png', 'pst', 'psp', 'tif',
        'tiff', 'ai', 'drw', 'dxf', 'eps', 'ps', 'svg',

        # audio
        'mp3', 'wma', 'ogg', 'wav', 'ra', 'aac', 'mid', 'au', 'aiff',

        # video
        '3gp', 'asf', 'asx', 'avi', 'mov', 'mp4', 'mpg', 'qt', 'rm', 'swf', 'wmv',
        'm4a',

        # office suites
        'xls', 'ppt', 'doc', 'docx', 'odt', 'ods', 'odg', 'odp',
        
        # other
        'css', 'pdf', 'exe', 'bin', 'rss', 'zip', 'rar', 'msi', 'asc',
        'dmg', 'bz2', 'ics'
        ])

    @classmethod
    def url_has_ignored_extension(cls, url):
        # note: split('.') never raises IndexError, so check for a dot explicitly
        pr = urlparse.urlparse(url)
        if '.' not in pr.path:
            return False
        ext = pr.path.rsplit('.', 1)[-1].lower()
        return ext in cls.IGNORED_EXTENSIONS
    
class HTMLDownloader(object):
    """ downloads any valid url returning text/plain or text/html
    """

    def __init__(self, url, logger):
        # url passed in here is assumed to be safe and valid
        self.url = url
        self.log = logger
        self.data = None
        self.headers={
            'User-Agent': _USERAGENT,
            'Accept': 'text/plain, text/html' # may not be honored by server !
            }

    def download(self):
        # do we need retries here ?
        req = urllib2.Request(self.url, self.data, self.headers)
        handle = None
        try:
            handle = urllib2.urlopen(req)
        except urllib2.HTTPError as e:
            self.log.elog('HTTP error in opening url ' + self.url + ' code: ' + str(e.code))
        except urllib2.URLError as e:
            self.log.elog('exception in opening url ' + self.url + ' reason: ' + str(e.reason))
        except IOError as e:
            self.log.elog('IOError exception in opening url ' + self.url + ' exception: ' + str(e))
            
        # bail out early if urlopen failed, before touching the handle
        if handle is None:
            self.log.elog('urlopen returned no handle for url: ' + self.url)
            return ''

        headers = handle.info()
        _m = _ACCEPTED_CONTENTTYPES.search(headers.get('Content-type', ''))
        content_type = _m.group(1) if _m else None

        if content_type not in ["plain", "html"]:
            self.log.elog('content-type %s returned is not plain or html. url: %s' % (headers.get('Content-type', ''), self.url))
            return ''

        # all good so far !
        return handle.read()
    
class HTMLLinkExtractor(HTMLParser.HTMLParser):
    """ takes response content from parsing a link, parses it and extracts
    all the links from it.
    - if the url is not valid as per Utils.is_valid_url, skip it
    - if the url has ignored extension, then skip it
    - if the link tag has nofollow, then skip it
    - if there is a meta tag with noindex in the page, that page is skipped    
    """

    def __init__(self, url, logger):
        HTMLParser.HTMLParser.__init__(self)
        self.url = url
        self.log = logger
        self.urls = []
        self.base_url = None

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            url = None
            for attr, value in attrs:
                if attr == 'href':
                    url = value
                elif attr == 'rel' and 'nofollow' in (value or '').split():
                    return # this link shouldn't be followed
            if url is not None:
                self.urls.append(safe_download_url(url))

        if tag == 'base':
            self.base_url = dict(attrs).get('href')

        if tag == 'meta':
            # look for a robots meta tag whose content includes 'noindex'
            meta_attrs = dict(attrs)
            if meta_attrs.get('name', '').lower() == 'robots' and \
                    'noindex' in meta_attrs.get('content', '').lower():
                raise NoIndexMetaTagError

    def skip_url(self, url):
        # check if the passed in url is to be skipped
        if IgnoredExtensions.url_has_ignored_extension(url):
            self.log.dlog('url %s has ignored extension. skipping it.' % url)
            return True
        if not Utils.is_valid_url(url):
            self.log.dlog('url %s is not valid url for download. skipping it.' % url)
            return True
        return False
    
    def extracted_links(self, response_text):
        self.feed(response_text)
        self.close()
        urls = []
        base_url = urlparse.urljoin(self.url, self.base_url) if self.base_url else self.url
        for _u in self.urls:
            url = safe_download_url(urlparse.urljoin(base_url, _u))
            if not self.skip_url(url):
                urls.append(url)
        # TODO: review the following set usage
        return iter(set(urls)) 
    
class CrawlWorker(object):
    """ worker logic which gets a unit of work (a link) from the
    manager, downloads it, parses it and submits new units of work (links)
    back to the manager
    """

    def __init__(self, manager, logger):
        self.manager = manager
        self.log = logger

    def run(self):
        while True:
            self.get_work()
            if self.url is None:
                return # done

            # raises DepthExceededError when the configured depth is exceeded
            self.process_work()

    def get_work(self):
        self.url = self.manager.get_url()

    def get_robotparser(self):
        # ask the manager for robot parser for this domain
        # if none found, get one and give it to manager
        pr = urlparse.urlparse(self.url)
        domain = pr.scheme + '://' + pr.netloc 
        robotparser = self.manager.get_robotparser(domain)
        if robotparser is None:
            robotstxt = domain + '/' + 'robots.txt'
            robotparser = RobotFileParser(robotstxt)
            robotparser.read()
            self.manager.set_robotparser(domain, robotparser)
        return robotparser

    def process_work(self):
        pr = urlparse.urlparse(self.url)

        if pr.scheme != 'file':
            robotparser = self.get_robotparser()
        else:
            robotparser = None
            
        if pr.scheme == 'file' or robotparser.can_fetch(_USERAGENT, self.url):
            d = HTMLDownloader(self.url, self.log)
            response = d.download()
            
            self.manager.set_visited(self.url)
            
            if response != '':
                self.log.dlog('successfully visited %s' % self.url)
                self.manager.incr_successful_fetches()
                try:
                    p = HTMLLinkExtractor(self.url, self.log)

                    try:
                        
                        for u in p.extracted_links(response):
                            if self.manager.not_visited(u):
                                self.manager.add_url(u)
                            else:
                                self.log.dlog('url %s visited already.' % u)
                                
                    except NoIndexMetaTagError:
                        self.log.dlog('url %s has meta tag with content=noindex. not parsing it.' % self.url)
                        
                except HTMLParser.HTMLParseError as e:
                    self.log.elog('parse error in url %s exception %s in line %s' % (self.url,  e.msg, e.lineno))
            else:
                self.manager.incr_unsuccessful_fetches()
                self.log.elog('HTMLDownloader empty string as response for url: %s' % self.url)
        else:
            self.log.dlog('url %s skipped as robots.txt in domain has Disallow rule' % self.url)

class CrawlManager(object):
    """ singleton which encapsulates the repository of urls, visited links
    and logic to handle them

    - follows FIFO logic for crawling the urls
    """

    def __init__(self, logger):
        self.urls = []
        self.visited_urls = {}
        self.robotparsers = {}
        self.log = logger
        # seconds to sleep before spawning any request (note: not used yet)
        self.sleep_interval = 3
        self.successful_fetches = 0
        self.unsuccessful_fetches = 0
        self.depth = -1 # -1 means go on visiting all the urls

    def add_url(self, url):
        # implements the logic of depth constraint
        if (self.depth > 0 and len(self.visited_urls) >= self.depth):
            self.log.dlog('not queuing any more urls as depth exceeded.')
            raise DepthExceededError
        if self.not_visited(url):
            self.urls.append(url)

    def get_url(self):
        try:
            return self.urls.pop(0)
        except IndexError:
            return None

    def set_visited(self, url):
        self.visited_urls[url] = 1

    def not_visited(self, url):
        return self.visited_urls.get(url, None) is None

    def visited(self, url):
        return self.visited_urls.get(url, None) is not None

    def set_robotparser(self, domain, robotparser):

        if self.robotparsers.get(domain, None) is None:
            self.robotparsers[domain] = robotparser
        else:
            self.log.elog('attempt to override a robotparser made.')

    def get_robotparser(self, domain):
        return self.robotparsers.get(domain, None)

    def incr_successful_fetches(self):
        self.successful_fetches += 1

    def incr_unsuccessful_fetches(self):
        self.unsuccessful_fetches += 1

    def start(self, depth):
        # if depth = -1, continue crawling till interrupted or no more urls
        # else go upto the depth and return
        self.depth = depth
        self.log.dlog('started crawling')

        try:
            # start calling crawl workers to do the work
            cw = CrawlWorker(self, self.log)
            cw.run()
        except DepthExceededError:
            pass

        self.log.dlog('stopped crawling')

    def stats(self):
        print '#successful fetches: ', self.successful_fetches
        print '#unsuccessful fetches: ', self.unsuccessful_fetches
    
class ToyCrawler(object):

    def __init__(self, seed, depth, debuglog):
        self.seed = seed
        self.depth = depth
        self.log = Logger('toycrawler.log', debuglog)
        self.crawl_manager = CrawlManager(logger=self.log)

    def _sanitytest(self):
        # sanity test the seed
        if not Utils.is_valid_url(self.seed):
            raise ImproperSeedError
        
    def start(self):
        """ start crawling
        """
        self._sanitytest()
        self.crawl_manager.add_url(self.seed)
        self.crawl_manager.start(depth=self.depth)
        self.crawl_manager.stats()

    # this makes ToyCrawler callable so it can be used as the interrupt handler:
    # print stats and exit cleanly
    def __call__(self, signum, frame):
        print 'shutdown requested. stopping crawling.'
        self.log.dlog('stopped crawling')
        self.crawl_manager.stats()
        sys.exit(0)

def parse_command_line():
    """ sets up the required command line arguments parsing logic
    returns: ArgumentParser object
    """
    
    args_parser = argparse.ArgumentParser(
        description='Toy Crawler - that just crawls webpages from the given seed url.',
        epilog='Stop with a keyboard interrupt (Ctrl-C) if you want to stop crawling in between.')
    args_parser.add_argument('--seed', dest='seed', required=True, help='seed url to start the crawling with. format: scheme://netloc:port/path')
    args_parser.add_argument('--depth', dest='depth', default=-1, help='number of links to crawl up to. if 100, first 100 links from seed url are crawled.')
    args_parser.add_argument('--debug', dest='debuglog', default=0, help='enable debug logs? pass 1 if yes.', choices=['1'])
    return args_parser.parse_args()
        
def main():
    args = parse_command_line()
    t = ToyCrawler(args.seed, int(args.depth), int(args.debuglog))
    register_signal_handlers(t)
    t.start()
    sys.exit(0)

if __name__ == '__main__':
    main()
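
If you prefer to drive the crawler from another Python program instead of the command line, something along these lines should work (a minimal sketch; it assumes the listing above is saved as toycrawler.py, which is just an assumption):

# a minimal sketch of driving the crawler programmatically,
# assuming the listing above is saved as toycrawler.py
from toycrawler import ToyCrawler, register_signal_handlers

crawler = ToyCrawler(seed='http://example.com/', depth=100, debuglog=1)
register_signal_handlers(crawler)  # Ctrl-C then prints stats and exits cleanly
crawler.start()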
