Source code for kibitzr.transformer.html

import sys
import contextlib
import logging
import functools

import six


# Module-level logger named after this module; the selector helpers below
# use it to warn when a tag or CSS selector matches nothing.
logger = logging.getLogger(__name__)


class SoupOps:
    """
    HTML transforms backed by BeautifulSoup.

    Every transform takes an HTML string and returns an ``(ok, content)``
    pair.  The selector transforms return ``(False, html)`` (the input,
    untouched) when nothing matches.
    """

    def __init__(self, selector=None, select_all=False):
        """
        Remember the selector and whether all matches should be kept.

        ``selector`` is handed to ``find``/``select`` as is; ``select_all``
        is only consulted by :meth:`css_selector`.
        """
        self.select_all = select_all
        self.selector = selector

    def tag_selector(self, html):
        """
        Return the markup of the element found by ``doc.find(selector)``.

        Logs a warning and returns ``(False, html)`` when nothing is found.
        """
        with soup(html) as doc:
            found = doc.find(self.selector)
            if not found:
                logger.warning('Tag not found: %r', self.selector)
                return False, html
            return True, six.text_type(found)

    def css_selector(self, html):
        """
        Return the markup of the element(s) matched by the CSS selector.

        With ``select_all`` the markup of every match is concatenated
        (an empty match list yields an empty string); otherwise only the
        first match is used.  An ``IndexError`` - i.e. no first match -
        is logged as a warning and reported as ``(False, html)``.
        """
        with soup(html) as doc:
            try:
                matches = doc.select(self.selector)
                if not self.select_all:
                    return True, six.text_type(matches[0])
                return True, u"".join(map(six.text_type, matches))
            except IndexError:
                logger.warning('CSS selector not found: %r', self.selector)
                return False, html

    @staticmethod
    def extract_text(html):
        """
        Return the document's stripped strings, one per line.

        Falsy (empty) strings are dropped before joining with newlines.
        """
        with soup(html) as doc:
            # The join must happen inside the ``with`` block: it is what
            # actually consumes ``stripped_strings``.
            text = u'\n'.join(filter(None, doc.stripped_strings))
            return True, text

    @classmethod
    def factory(cls, key, value, conf):
        """
        Build a ``transform(content)`` callable for a registry key.

        ``key`` has the form ``<action>`` or ``<action>-all``; the action
        is looked up in ``SHORTCUTS`` (``KeyError`` if unknown) and
        ``value`` becomes the selector.  ``conf`` is accepted but unused.
        """
        action, _, suffix = key.partition('-')
        handler = cls.SHORTCUTS[action]
        select_all = suffix == 'all'

        def transform(content):
            # A fresh instance per call; ``__get__`` binds plain functions
            # to it and unwraps the staticmethod entry.
            instance = cls(selector=value, select_all=select_all)
            bound = handler.__get__(instance, cls)
            return bound(content)

        return transform

    # Action name -> raw (unbound) handler as defined in the class body.
    SHORTCUTS = {
        'tag': tag_selector,
        'css': css_selector,
        'text': extract_text,
    }
@contextlib.contextmanager
def soup(html):
    """
    Context manager yielding ``html`` parsed with BeautifulSoup.

    Both the parsing and the body of the ``with`` block run inside
    ``deep_recursion()``.  The built-in ``"html.parser"`` backend is used.
    """
    # Imported lazily, at call time rather than at module import.
    import bs4  # pylint: disable=import-outside-toplevel

    with deep_recursion():
        document = bs4.BeautifulSoup(html, "html.parser")
        yield document
@contextlib.contextmanager
def deep_recursion(limit=100000):
    """
    Temporarily raise the interpreter's recursion limit.

    Inside the ``with`` block the limit is at least ``limit``; on exit
    (normal or via an exception) the previous limit is restored.

    The limit is never lowered: if the process already runs with a limit
    above ``limit``, it is left as it is for the duration of the block.

    :param limit: minimum recursion limit wanted inside the block.
    """
    old_limit = sys.getrecursionlimit()
    try:
        # max() guards against shrinking a limit that is already higher,
        # which would defeat the purpose of this helper.
        sys.setrecursionlimit(max(old_limit, limit))
        yield
    finally:
        sys.setrecursionlimit(old_limit)
def bake_html(key):
    """
    Return ``SoupOps.factory`` with its ``key`` argument pre-filled.

    The result still expects the remaining ``(value, conf)`` arguments.
    """
    factory = SoupOps.factory
    return functools.partial(factory, key)
def register():
    """
    Return dictionary of transform factories

    Keys are the transform names ``css``, ``css-all``, ``tag`` and
    ``text``; each value is the matching ``bake_html(key)`` factory.
    """
    names = ('css', 'css-all', 'tag', 'text')
    return dict((name, bake_html(name)) for name in names)