# Source code for kibitzr.transformer.html
import sys
import contextlib
import logging
import functools
import six
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class SoupOps:
    """HTML transforms implemented on top of a parsed BeautifulSoup document.

    Each transform method takes an HTML string and returns a two-item tuple
    ``(ok, content)``: ``ok`` is True when the transform produced a result,
    and False when nothing matched, in which case ``content`` is the
    unmodified input HTML.
    """

    def __init__(self, selector=None, select_all=False):
        """Store the selector and the "all matches" flag used by transforms.

        :param selector: tag name or CSS selector, depending on the transform.
        :param select_all: when True, ``css_selector`` joins every match
            instead of returning only the first one.
        """
        self.selector = selector
        self.select_all = select_all

    def tag_selector(self, html):
        """Return the first element found by ``doc.find(self.selector)``.

        :param html: HTML markup to parse.
        :returns: ``(True, element_markup)`` when an element is found,
            otherwise ``(False, html)`` after logging a warning.
        """
        with soup(html) as doc:
            element = doc.find(self.selector)
            if not element:
                logger.warning('Tag not found: %r', self.selector)
                return False, html
            return True, six.text_type(element)

    def css_selector(self, html):
        """Return markup of the element(s) matched by ``doc.select``.

        With ``select_all`` set, the markup of every match is concatenated;
        an empty match list then yields ``(True, u"")``.  Without it, only
        the first match is returned.

        :param html: HTML markup to parse.
        :returns: ``(True, markup)`` on success, or ``(False, html)`` after
            logging a warning when a single match was requested and the
            selector matched nothing.
        """
        with soup(html) as doc:
            elements = doc.select(self.selector)
            if self.select_all:
                return True, u"".join(six.text_type(x) for x in elements)
            if not elements:
                logger.warning('CSS selector not found: %r', self.selector)
                return False, html
            return True, six.text_type(elements[0])

    def extract_text(self, html):
        """Return the text content of *html*, one stripped string per line.

        This method backs the ``'text'`` entry of ``SHORTCUTS``; without it
        the class body fails with ``NameError`` while building that mapping.

        :param html: HTML markup to parse.
        :returns: ``(True, text)`` where ``text`` is the document's stripped
            strings joined with newlines.
        """
        with soup(html) as doc:
            # Joined inside the ``with`` block so the lazy iterator is
            # consumed while the raised recursion limit is still in effect.
            return True, u"\n".join(doc.stripped_strings)

    @classmethod
    def factory(cls, key, value, conf):
        """Build a transform callable for the given configuration key.

        :param key: transform name; the part before the first ``-`` selects
            the handler from ``SHORTCUTS`` and a ``-all`` suffix turns on
            ``select_all`` (e.g. ``'css-all'``).
        :param value: selector passed to the created ``SoupOps`` instance.
        :param conf: accepted as part of the factory signature; not used here.
        :returns: a one-argument function mapping content to the handler's
            ``(ok, content)`` result.
        :raises KeyError: if the action part of *key* is not in ``SHORTCUTS``.
        """
        action, _, all_flag = key.partition('-')
        select_all = (all_flag == 'all')
        # Resolved eagerly so an unknown key fails when the factory is
        # called rather than on the first transform.
        handler = cls.SHORTCUTS[action]

        def transform(content):
            # A fresh instance per call; handlers are plain functions taken
            # from the class body, so the instance is passed explicitly.
            instance = cls(selector=value, select_all=select_all)
            return handler(instance, content)

        return transform

    # Maps the action part of a transform key to its handler function.
    SHORTCUTS = {
        'tag': tag_selector,
        'css': css_selector,
        'text': extract_text,
    }
@contextlib.contextmanager
def soup(html):
    """Context manager yielding *html* parsed with the ``html.parser`` backend.

    The document is built and yielded inside ``deep_recursion()``, so the
    raised recursion limit is in effect both while parsing and for the whole
    body of the caller's ``with`` block.
    """
    # bs4 is imported at call time rather than at module import time.
    from bs4 import BeautifulSoup  # pylint: disable=import-outside-toplevel

    with deep_recursion():
        document = BeautifulSoup(html, "html.parser")
        yield document
@contextlib.contextmanager
def deep_recursion(limit=100000):
    """Temporarily raise the interpreter recursion limit to at least *limit*.

    The previous limit is restored on exit, including when the body raises.
    If the current limit is already at or above *limit* it is left untouched,
    so entering this context never lowers the limit.

    :param limit: minimum recursion limit wanted inside the block.
    """
    old_limit = sys.getrecursionlimit()
    try:
        # Only ever raise the limit: lowering it could make code that relied
        # on a previously configured higher limit fail inside the block.
        sys.setrecursionlimit(max(old_limit, limit))
        yield
    finally:
        sys.setrecursionlimit(old_limit)
def bake_html(key):
    """Return ``SoupOps.factory`` with *key* pre-bound as its first argument.

    The result is a ``functools.partial``; calling it with the remaining
    ``value`` and ``conf`` arguments invokes ``SoupOps.factory(key, value, conf)``.
    """
    factory = SoupOps.factory
    return functools.partial(factory, key)
def register():
    """
    Return dictionary of transform factories.

    Keys are the transform names ``'css'``, ``'css-all'``, ``'tag'`` and
    ``'text'``; each value is the result of ``bake_html`` for that name.
    """
    registry = {}
    for name in ('css', 'css-all', 'tag', 'text'):
        registry[name] = bake_html(name)
    return registry