# Source code for kibitzr.transformer.html

import sys
import contextlib
import logging
import functools

import six


logger = logging.getLogger(__name__)


class SoupOps:
    """HTML transforms built on top of the ``soup`` context manager.

    Every transform takes an HTML string and returns a two-item tuple
    ``(ok, content)``: ``ok`` is ``True`` when the transform produced a
    result, and ``False`` when the selector matched nothing, in which case
    ``content`` is the unmodified input.

    :param selector: value passed to ``doc.find`` / ``doc.select``.
    :param select_all: when true, ``css_selector`` joins every match
        instead of returning only the first one.
    """

    def __init__(self, selector=None, select_all=False):
        self.selector = selector
        self.select_all = select_all

    def tag_selector(self, html):
        """Return the first element found by ``doc.find(self.selector)``.

        :param html: HTML text to search.
        :returns: ``(True, element_as_text)`` on a match; otherwise a
            warning is logged and ``(False, html)`` is returned.
        """
        with soup(html) as doc:
            element = doc.find(self.selector)
            # Explicit None check: "no match" is signalled by None, so the
            # outcome must not depend on the truthiness of the element.
            if element is None:
                logger.warning('Tag not found: %r', self.selector)
                return False, html
            return True, six.text_type(element)

    def css_selector(self, html):
        """Return the element(s) matched by ``doc.select(self.selector)``.

        With ``select_all`` unset only the first match is returned; if
        there is none, a warning is logged and ``(False, html)`` is
        returned.  With ``select_all`` set all matches are concatenated;
        an empty match list then yields ``(True, u"")`` because joining
        nothing never raises ``IndexError``.

        :param html: HTML text to search.
        :returns: ``(ok, content)`` tuple as described above.
        """
        with soup(html) as doc:
            try:
                elements = doc.select(self.selector)
                if self.select_all:
                    result = u"".join(six.text_type(x) for x in elements)
                else:
                    # Raises IndexError when nothing matched.
                    result = six.text_type(elements[0])
            except IndexError:
                logger.warning('CSS selector not found: %r', self.selector)
                return False, html
            return True, result

    @staticmethod
    def extract_text(html):
        """Return the document's stripped strings, one per line.

        :param html: HTML text to extract from.
        :returns: ``(True, text)`` where ``text`` joins every non-empty
            item of ``doc.stripped_strings`` with newlines.
        """
        with soup(html) as doc:
            return True, u'\n'.join(
                line
                for line in doc.stripped_strings
                if line
            )

    @classmethod
    def factory(cls, key, value, conf):
        """Build a ``transform(content)`` callable for transform *key*.

        :param key: transform name, e.g. ``'css'``, ``'css-all'``,
            ``'tag'`` or ``'text'``.  The part before the first ``'-'``
            picks the handler from ``SHORTCUTS``; a suffix of ``'all'``
            turns on ``select_all``.
        :param value: selector handed to the instance.
        :param conf: accepted as part of the factory signature; not used
            by this implementation.
        :returns: function taking the content and returning the handler's
            ``(ok, content)`` tuple.
        :raises KeyError: if the action is not a key of ``SHORTCUTS``.
        """
        action, _, all_flag = key.partition('-')
        handler = cls.SHORTCUTS[action]
        instance = cls(selector=value, select_all=(all_flag == 'all'))
        # SHORTCUTS stores the raw objects from the class body (plain
        # functions and a staticmethod wrapper), so bind through the
        # descriptor protocol to get something callable with one argument.
        method = handler.__get__(instance, cls)

        def transform(content):
            return method(content)
        return transform

    # Maps the action part of a transform key to its handler.  Defined
    # after the methods because it refers to them by their class-body names.
    SHORTCUTS = {
        'tag': tag_selector,
        'css': css_selector,
        'text': extract_text,
    }


@contextlib.contextmanager
def soup(html):
    """Context manager yielding *html* parsed by BeautifulSoup.

    The document is built with the ``"html.parser"`` backend inside
    ``deep_recursion()``, so the raised recursion limit stays in effect
    for the whole ``with`` body and is restored when it exits.

    :param html: HTML markup to parse.
    """
    # Imported here rather than at module level, so bs4 is only loaded
    # when a transform actually runs.
    from bs4 import BeautifulSoup  # pylint: disable=import-outside-toplevel
    with deep_recursion():
        yield BeautifulSoup(html, "html.parser")


@contextlib.contextmanager
def deep_recursion(limit=100000):
    """Temporarily raise the interpreter recursion limit.

    The limit is raised to *limit* for the duration of the ``with`` body
    and restored to its previous value afterwards, even if the body
    raises.  If the current limit is already higher than *limit* it is
    left untouched, so the context never lowers it.

    :param limit: recursion limit to guarantee inside the context.
    """
    old_limit = sys.getrecursionlimit()
    # max() keeps an already higher limit instead of reducing it.
    sys.setrecursionlimit(max(old_limit, limit))
    try:
        yield
    finally:
        sys.setrecursionlimit(old_limit)


def bake_html(key):
    """Return ``SoupOps.factory`` with *key* pre-filled.

    :param key: transform name to bind as the factory's first argument.
    :returns: ``functools.partial`` that still expects the remaining
        factory arguments (``value`` and ``conf``).
    """
    return functools.partial(SoupOps.factory, key)


def register():
    """
    Return dictionary of transform factories

    Keys are the transform names ``'css'``, ``'css-all'``, ``'tag'`` and
    ``'text'``; each value is the result of ``bake_html`` for that key.
    """
    return {
        key: bake_html(key)
        for key in ('css', 'css-all', 'tag', 'text')
    }
# Source code for kibitzr.transformer.html
#
# kibitzr
#
# Navigation
#
# Related Topics