diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py
index 7ba34269..fcc27457 100644
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@@ -5,26 +5,30 @@ http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
+provides methods and Pythonic idioms that make it easy to navigate,
+search, and modify the parse tree.
-Beautiful Soup works with Python 2.6 and up. It works better if lxml
+Beautiful Soup works with Python 2.7 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.3.2"
-__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
+__version__ = "4.8.1"
+__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
+# Use of this source code is governed by the MIT license.
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import os
import re
+import sys
+import traceback
import warnings
from .builder import builder_registry, ParserRejectedMarkup
@@ -45,7 +49,7 @@ from .element import (
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
@@ -59,7 +63,7 @@ class BeautifulSoup(Tag):
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
- endData(containerClass=NavigableString) # Ends the current data node
+ endData(containerClass) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
@@ -69,21 +73,70 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
- ROOT_TAG_NAME = u'[document]'
+ ROOT_TAG_NAME = '[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
+
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
+ NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
+
def __init__(self, markup="", features=None, builder=None,
- parse_only=None, from_encoding=None, **kwargs):
- """The Soup object is initialized as the 'root tag', and the
- provided markup (which can be a string or a file-like object)
- is fed into the underlying parser."""
+ parse_only=None, from_encoding=None, exclude_encodings=None,
+ element_classes=None, **kwargs):
+ """Constructor.
+
+ :param markup: A string or a file-like object representing
+ markup to be parsed.
+
+ :param features: Desirable features of the parser to be used. This
+ may be the name of a specific parser ("lxml", "lxml-xml",
+ "html.parser", or "html5lib") or it may be the type of markup
+ to be used ("html", "html5", "xml"). It's recommended that you
+ name a specific parser, so that Beautiful Soup gives you the
+ same results across platforms and virtual environments.
+
+ :param builder: A TreeBuilder subclass to instantiate (or
+ instance to use) instead of looking one up based on
+ `features`. You only need to use this if you've implemented a
+ custom TreeBuilder.
+
+ :param parse_only: A SoupStrainer. Only parts of the document
+ matching the SoupStrainer will be considered. This is useful
+ when parsing part of a document that would otherwise be too
+ large to fit into memory.
+
+ :param from_encoding: A string indicating the encoding of the
+ document to be parsed. Pass this in if Beautiful Soup is
+ guessing wrongly about the document's encoding.
+
+ :param exclude_encodings: A list of strings indicating
+ encodings known to be wrong. Pass this in if you don't know
+ the document's encoding but you know Beautiful Soup's guess is
+ wrong.
+
+ :param element_classes: A dictionary mapping BeautifulSoup
+ classes like Tag and NavigableString to other classes you'd
+ like to be instantiated instead as the parse tree is
+ built. This is useful for using subclasses to modify the
+ default behavior of Tag or NavigableString.
+
+ :param kwargs: For backwards compatibility purposes, the
+ constructor accepts certain keyword arguments used in
+ Beautiful Soup 3. None of these arguments do anything in
+ Beautiful Soup 4; they will result in a warning and then be ignored.
+
+ Apart from this, any keyword arguments passed into the BeautifulSoup
+ constructor are propagated to the TreeBuilder constructor. This
+ makes it possible to configure a TreeBuilder beyond saying
+ which one to use.
+
+ """
if 'convertEntities' in kwargs:
+ del kwargs['convertEntities']
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
@@ -114,9 +167,9 @@ class BeautifulSoup(Tag):
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
- "BeautifulSoup constructor. You can pass in features='html' "
- "or features='xml' to get a builder capable of handling "
- "one or the other.")
+ "BeautifulSoup constructor. Suggest you use "
+ "features='lxml' for HTML and features='lxml-xml' for "
+ "XML.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
@@ -134,13 +187,24 @@ class BeautifulSoup(Tag):
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
- if len(kwargs) > 0:
- arg = kwargs.keys().pop()
- raise TypeError(
- "__init__() got an unexpected keyword argument '%s'" % arg)
+ if from_encoding and isinstance(markup, str):
+ warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+ from_encoding = None
- if builder is None:
- if isinstance(features, basestring):
+ self.element_classes = element_classes or dict()
+
+ # We need this information to track whether or not the builder
+ # was specified well enough that we can omit the 'you need to
+ # specify a parser' warning.
+ original_builder = builder
+ original_features = features
+
+ if isinstance(builder, type):
+ # A builder class was passed in; it needs to be instantiated.
+ builder_class = builder
+ builder = None
+ elif builder is None:
+ if isinstance(features, str):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
@@ -150,21 +214,73 @@ class BeautifulSoup(Tag):
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
- builder = builder_class()
+
+ # At this point either we have a TreeBuilder instance in
+ # builder, or we have a builder_class that we can instantiate
+ # with the remaining **kwargs.
+ if builder is None:
+ builder = builder_class(**kwargs)
+ if not original_builder and not (
+ original_features == builder.NAME or
+ original_features in builder.ALTERNATE_NAMES
+ ):
+ if builder.is_xml:
+ markup_type = "XML"
+ else:
+ markup_type = "HTML"
+
+ # This code adapted from warnings.py so that we get the same line
+ # of code as our warnings.warn() call gets, even if the answer is wrong
+ # (as it may be in a multithreading situation).
+ caller = None
+ try:
+ caller = sys._getframe(1)
+ except ValueError:
+ pass
+ if caller:
+ globals = caller.f_globals
+ line_number = caller.f_lineno
+ else:
+ globals = sys.__dict__
+ line_number= 1
+ filename = globals.get('__file__')
+ if filename:
+ fnl = filename.lower()
+ if fnl.endswith((".pyc", ".pyo")):
+ filename = filename[:-1]
+ if filename:
+ # If there is no filename at all, the user is most likely in a REPL,
+ # and the warning is not necessary.
+ values = dict(
+ filename=filename,
+ line_number=line_number,
+ parser=builder.NAME,
+ markup_type=markup_type
+ )
+ warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
+ else:
+ if kwargs:
+ warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
+
self.builder = builder
self.is_xml = builder.is_xml
- self.builder.soup = self
-
+ self.known_xml = self.is_xml
+ self._namespaces = dict()
self.parse_only = parse_only
+ self.builder.initialize_soup(self)
+
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
- elif len(markup) <= 256:
+ elif len(markup) <= 256 and (
+ (isinstance(markup, bytes) and not b'<' in markup)
+ or (isinstance(markup, str) and not '<' in markup)
+ ):
# Print out warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
- if (isinstance(markup, unicode)
+ if (isinstance(markup, str)
and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
@@ -172,37 +288,93 @@ class BeautifulSoup(Tag):
is_file = False
try:
is_file = os.path.exists(possible_filename)
- except Exception, e:
+ except Exception as e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
pass
if is_file:
+ if isinstance(markup, str):
+ markup = markup.encode("utf8")
warnings.warn(
- '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
- if markup[:5] == "http:" or markup[:6] == "https:":
- # TODO: This is ugly but I couldn't get it to work in
- # Python 3 otherwise.
- if ((isinstance(markup, bytes) and not b' ' in markup)
- or (isinstance(markup, unicode) and not u' ' in markup)):
- warnings.warn(
- '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
+ '"%s" looks like a filename, not markup. You should'
+ ' probably open this file and pass the filehandle into'
+ ' Beautiful Soup.' % markup)
+ self._check_markup_is_url(markup)
+ rejections = []
+ success = False
for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in (
- self.builder.prepare_markup(markup, from_encoding)):
+ self.builder.prepare_markup(
+ markup, from_encoding, exclude_encodings=exclude_encodings)):
self.reset()
try:
self._feed()
+ success = True
break
- except ParserRejectedMarkup:
+ except ParserRejectedMarkup as e:
+ rejections.append(e)
pass
+ if not success:
+ other_exceptions = [str(e) for e in rejections]
+ raise ParserRejectedMarkup(
+ "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+ )
+
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
self.builder.soup = None
+ def __copy__(self):
+ copy = type(self)(
+ self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+ )
+
+ # Although we encoded the tree to UTF-8, that may not have
+ # been the encoding of the original markup. Set the copy's
+ # .original_encoding to reflect the original object's
+ # .original_encoding.
+ copy.original_encoding = self.original_encoding
+ return copy
+
+ def __getstate__(self):
+ # Frequently a tree builder can't be pickled.
+ d = dict(self.__dict__)
+ if 'builder' in d and not self.builder.picklable:
+ d['builder'] = None
+ return d
+
+ @staticmethod
+ def _check_markup_is_url(markup):
+ """
+ Check if markup looks like it's actually a url and raise a warning
+ if so. Markup can be unicode or str (py2) / bytes (py3).
+ """
+ if isinstance(markup, bytes):
+ space = b' '
+ cant_start_with = (b"http:", b"https:")
+ elif isinstance(markup, str):
+ space = ' '
+ cant_start_with = ("http:", "https:")
+ else:
+ return
+
+ if any(markup.startswith(prefix) for prefix in cant_start_with):
+ if not space in markup:
+ if isinstance(markup, bytes):
+ decoded_markup = markup.decode('utf-8', 'replace')
+ else:
+ decoded_markup = markup
+ warnings.warn(
+ '"%s" looks like a URL. Beautiful Soup is not an'
+ ' HTTP client. You should probably use an HTTP client like'
+ ' requests to get the document behind the URL, and feed'
+ ' that document to Beautiful Soup.' % decoded_markup
+ )
+
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
@@ -223,15 +395,21 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack = []
self.pushTag(self)
- def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+ def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
+ sourceline=None, sourcepos=None, **kwattrs):
"""Create a new tag associated with this soup."""
- return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+ kwattrs.update(attrs)
+ return self.element_classes.get(Tag, Tag)(
+ None, self.builder, name, namespace, nsprefix, kwattrs,
+ sourceline=sourceline, sourcepos=sourcepos
+ )
- def new_string(self, s, subclass=NavigableString):
+ def new_string(self, s, subclass=None):
"""Create a new NavigableString associated with this soup."""
- navigable = subclass(s)
- navigable.setup()
- return navigable
+ subclass = subclass or self.element_classes.get(
+ NavigableString, NavigableString
+ )
+ return subclass(s)
def insert_before(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
@@ -250,16 +428,26 @@ class BeautifulSoup(Tag):
def pushTag(self, tag):
#print "Push", tag.name
- if self.currentTag:
+ if self.currentTag is not None:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
- def endData(self, containerClass=NavigableString):
+ def endData(self, containerClass=None):
+
+ # Default container is NavigableString.
+ containerClass = containerClass or NavigableString
+
+ # The user may want us to instantiate some alias for the
+ # container class.
+ containerClass = self.element_classes.get(
+ containerClass, containerClass
+ )
+
if self.current_data:
- current_data = u''.join(self.current_data)
+ current_data = ''.join(self.current_data)
# If whitespace is not preserved, and this string contains
# nothing but ASCII spaces, replace it with a single space
# or newline.
@@ -289,15 +477,72 @@ class BeautifulSoup(Tag):
def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree."""
- parent = parent or self.currentTag
- most_recent_element = most_recent_element or self._most_recent_element
- o.setup(parent, most_recent_element)
-
+ if parent is None:
+ parent = self.currentTag
if most_recent_element is not None:
- most_recent_element.next_element = o
+ previous_element = most_recent_element
+ else:
+ previous_element = self._most_recent_element
+
+ next_element = previous_sibling = next_sibling = None
+ if isinstance(o, Tag):
+ next_element = o.next_element
+ next_sibling = o.next_sibling
+ previous_sibling = o.previous_sibling
+ if previous_element is None:
+ previous_element = o.previous_element
+
+ fix = parent.next_element is not None
+
+ o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
+
self._most_recent_element = o
parent.contents.append(o)
+ # Check if we are inserting into an already parsed node.
+ if fix:
+ self._linkage_fixer(parent)
+
+ def _linkage_fixer(self, el):
+ """Make sure linkage of this fragment is sound."""
+
+ first = el.contents[0]
+ child = el.contents[-1]
+ descendant = child
+
+ if child is first and el.parent is not None:
+ # Parent should be linked to first child
+ el.next_element = child
+ # We are no longer linked to whatever this element is
+ prev_el = child.previous_element
+ if prev_el is not None and prev_el is not el:
+ prev_el.next_element = None
+ # First child should be linked to the parent, and no previous siblings.
+ child.previous_element = el
+ child.previous_sibling = None
+
+ # We have no sibling as we've been appended as the last.
+ child.next_sibling = None
+
+ # This index is a tag, dig deeper for a "last descendant"
+ if isinstance(child, Tag) and child.contents:
+ descendant = child._last_descendant(False)
+
+ # As the final step, link last descendant. It should be linked
+ # to the parent's next sibling (if found), else walk up the chain
+ # and find a parent with a sibling. It should have no next sibling.
+ descendant.next_element = None
+ descendant.next_sibling = None
+ target = el
+ while True:
+ if target is None:
+ break
+ elif target.next_sibling is not None:
+ descendant.next_element = target.next_sibling
+ target.next_sibling.previous_element = child
+ break
+ target = target.parent
+
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
@@ -321,11 +566,12 @@ class BeautifulSoup(Tag):
return most_recently_popped
- def handle_starttag(self, name, namespace, nsprefix, attrs):
+ def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
+ sourcepos=None):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
- SoupStrainer. You should proceed as if the tag had not occured
+ SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
@@ -338,11 +584,14 @@ class BeautifulSoup(Tag):
or not self.parse_only.search_tag(name, attrs))):
return None
- tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
- self.currentTag, self._most_recent_element)
+ tag = self.element_classes.get(Tag, Tag)(
+ self, self.builder, name, namespace, nsprefix, attrs,
+ self.currentTag, self._most_recent_element,
+ sourceline=sourceline, sourcepos=sourcepos
+ )
if tag is None:
return tag
- if self._most_recent_element:
+ if self._most_recent_element is not None:
self._most_recent_element.next_element = tag
self._most_recent_element = tag
self.pushTag(tag)
@@ -367,9 +616,9 @@ class BeautifulSoup(Tag):
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
- prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+ prefix = '<?xml version="1.0"%s?>\n' % encoding_part
else:
- prefix = u''
+ prefix = ''
if not pretty_print:
indent_level = None
else:
@@ -403,4 +652,4 @@ class FeatureNotFound(ValueError):
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
- print soup.prettify()
+ print(soup.prettify())
diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py
index 740f5f29..03a4c1e0 100644
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@@ -1,10 +1,13 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
- whitespace_re
+ nonwhitespace_re
)
__all__ = [
@@ -80,21 +83,70 @@ builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
+ NAME = "[Unknown tree builder]"
+ ALTERNATE_NAMES = []
features = []
is_xml = False
- preserve_whitespace_tags = set()
+ picklable = False
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
-
+
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
- cdata_list_attributes = {}
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {}
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+
+ USE_DEFAULT = object()
- def __init__(self):
+ # Most parsers don't keep track of line numbers.
+ TRACKS_LINE_NUMBERS = False
+
+ def __init__(self, multi_valued_attributes=USE_DEFAULT,
+ preserve_whitespace_tags=USE_DEFAULT,
+ store_line_numbers=USE_DEFAULT):
+ """Constructor.
+
+ :param multi_valued_attributes: If this is set to None, the
+ TreeBuilder will not turn any values for attributes like
+ 'class' into lists. Setting this to a dictionary will
+ customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+ for an example.
+
+ Internally, these are called "CDATA list attributes", but that
+ probably doesn't make sense to an end-user, so the argument name
+ is `multi_valued_attributes`.
+
+ :param preserve_whitespace_tags: A list of tags to treat
+ the way <pre> tags are treated in HTML. Tags in this list
+ will have their whitespace preserved.
+
+ :param store_line_numbers: If the parser keeps track of the
+ line numbers and positions of the original markup, that
+ information will, by default, be stored in each corresponding
+ `Tag` object. You can turn this off by passing
+ store_line_numbers=False. If the parser you're using doesn't
+ keep track of this information, then setting store_line_numbers=True
+ will do nothing.
+ """
self.soup = None
-
+ if multi_valued_attributes is self.USE_DEFAULT:
+ multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+ self.cdata_list_attributes = multi_valued_attributes
+ if preserve_whitespace_tags is self.USE_DEFAULT:
+ preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+ self.preserve_whitespace_tags = preserve_whitespace_tags
+ if store_line_numbers == self.USE_DEFAULT:
+ store_line_numbers = self.TRACKS_LINE_NUMBERS
+ self.store_line_numbers = store_line_numbers
+
+ def initialize_soup(self, soup):
+ """The BeautifulSoup object has been initialized and is now
+ being associated with the TreeBuilder.
+ """
+ self.soup = soup
+
def reset(self):
pass
@@ -118,13 +170,13 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
-
+
def feed(self, markup):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
- return markup, None, None, False
+ document_declared_encoding=None, exclude_encodings=None):
+ yield markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
@@ -153,14 +205,14 @@ class TreeBuilder(object):
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None)
- for attr in attrs.keys():
+ for attr in list(attrs.keys()):
if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string
# value is a whitespace-separated list of
# values. Split it into a list.
value = attrs[attr]
- if isinstance(value, basestring):
- values = whitespace_re.split(value)
+ if isinstance(value, str):
+ values = nonwhitespace_re.findall(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
@@ -224,10 +276,20 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
- preserve_whitespace_tags = set(['pre', 'textarea'])
- empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base'])
+ empty_element_tags = set([
+ # These are from HTML5.
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+
+ # These are from earlier versions of HTML and are removed in HTML5.
+ 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
+ ])
+ # The HTML standard defines these as block-level elements. Beautiful
+ # Soup does not treat these elements differently from other elements,
+ # but it may do so eventually, and this information is available if
+ # you need to use it.
+ block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
+
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
@@ -235,7 +297,7 @@ class HTMLTreeBuilder(TreeBuilder):
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
- cdata_list_attributes = {
+ DEFAULT_CDATA_LIST_ATTRIBUTES = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
@@ -252,6 +314,8 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"],
}
+ DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+
def set_up_substitutions(self, tag):
# We are only interested in tags
if tag.name != 'meta':
@@ -299,8 +363,15 @@ def register_treebuilders_from(module):
this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception):
- pass
-
+ def __init__(self, message_or_exception):
+ """Explain why the parser rejected the given markup, either
+ with a textual explanation or another exception.
+ """
+ if isinstance(message_or_exception, Exception):
+ e = message_or_exception
+ message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
+ super(ParserRejectedMarkup, self).__init__(message_or_exception)
+
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py
index d46b695b..43199189 100644
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -1,17 +1,27 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
__all__ = [
'HTML5TreeBuilder',
]
import warnings
+import re
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
-from bs4.element import NamespacedAttribute
+from bs4.element import (
+ NamespacedAttribute,
+ nonwhitespace_re,
+)
import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+ namespaces,
+ prefixes,
+ )
from bs4.element import (
Comment,
Doctype,
@@ -19,14 +29,36 @@ from bs4.element import (
Tag,
)
+try:
+ # Pre-0.99999999
+ from html5lib.treebuilders import _base as treebuilder_base
+ new_html5lib = False
+except ImportError as e:
+ # 0.99999999 and up
+ from html5lib.treebuilders import base as treebuilder_base
+ new_html5lib = True
+
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
- features = ['html5lib', PERMISSIVE, HTML_5, HTML]
+ NAME = "html5lib"
- def prepare_markup(self, markup, user_specified_encoding):
+ features = [NAME, PERMISSIVE, HTML_5, HTML]
+
+ # html5lib can tell us which line number and position in the
+ # original file is the source of an element.
+ TRACKS_LINE_NUMBERS = True
+
+ def prepare_markup(self, markup, user_specified_encoding,
+ document_declared_encoding=None, exclude_encodings=None):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
+
+ # document_declared_encoding and exclude_encodings aren't used
+ # ATM because the html5lib TreeBuilder doesn't use
+ # UnicodeDammit.
+ if exclude_encodings:
+ warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
yield (markup, None, None, False)
# These methods are defined by Beautiful Soup.
@@ -34,32 +66,63 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
- doc = parser.parse(markup, encoding=self.user_specified_encoding)
-
+ self.underlying_builder.parser = parser
+ extra_kwargs = dict()
+ if not isinstance(markup, str):
+ if new_html5lib:
+ extra_kwargs['override_encoding'] = self.user_specified_encoding
+ else:
+ extra_kwargs['encoding'] = self.user_specified_encoding
+ doc = parser.parse(markup, **extra_kwargs)
+
# Set the character encoding detected by the tokenizer.
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
- doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
-
+ original_encoding = parser.tokenizer.stream.charEncoding[0]
+ if not isinstance(original_encoding, str):
+ # In 0.99999999 and up, the encoding is an html5lib
+ # Encoding object. We want to use a string for compatibility
+ # with other tree builders.
+ original_encoding = original_encoding.name
+ doc.original_encoding = original_encoding
+ self.underlying_builder.parser = None
+
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
- self.soup, namespaceHTMLElements)
+ namespaceHTMLElements, self.soup,
+ store_line_numbers=self.store_line_numbers
+ )
return self.underlying_builder
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'<html><head></head><body>%s</body></html>' % fragment
+ return '<html><head></head><body>%s</body></html>' % fragment
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
-
- def __init__(self, soup, namespaceHTMLElements):
- self.soup = soup
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
+
+ def __init__(self, namespaceHTMLElements, soup=None,
+ store_line_numbers=True, **kwargs):
+ if soup:
+ self.soup = soup
+ else:
+ from bs4 import BeautifulSoup
+ # TODO: Why is the parser 'html.parser' here? To avoid an
+ # infinite loop?
+ self.soup = BeautifulSoup(
+ "", "html.parser", store_line_numbers=store_line_numbers,
+ **kwargs
+ )
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
+ # This will be set later to an html5lib.html5parser.HTMLParser
+ # object, which we can use to track the current line number.
+ self.parser = None
+ self.store_line_numbers = store_line_numbers
+
def documentClass(self):
self.soup.reset()
return Element(self.soup, self.soup, None)
@@ -73,14 +136,26 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
- tag = self.soup.new_tag(name, namespace)
+ kwargs = {}
+ if self.parser and self.store_line_numbers:
+ # This represents the point immediately after the end of the
+ # tag. We don't know when the tag started, but we do know
+ # where it ended -- the character just before this one.
+ sourceline, sourcepos = self.parser.tokenizer.stream.position()
+ kwargs['sourceline'] = sourceline
+ kwargs['sourcepos'] = sourcepos-1
+ tag = self.soup.new_tag(name, namespace, **kwargs)
+
return Element(tag, self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
- self.soup = BeautifulSoup("")
+ from bs4 import BeautifulSoup
+ # TODO: Why is the parser 'html.parser' here? To avoid an
+ # infinite loop?
+ self.soup = BeautifulSoup("", "html.parser")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
@@ -92,7 +167,57 @@ class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
return self.soup
def getFragment(self):
- return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
+ return treebuilder_base.TreeBuilder.getFragment(self).element
+
+ def testSerializer(self, element):
+ from bs4 import BeautifulSoup
+ rv = []
+ doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+
+ def serializeElement(element, indent=0):
+ if isinstance(element, BeautifulSoup):
+ pass
+ if isinstance(element, Doctype):
+ m = doctype_re.match(element)
+ if m:
+ name = m.group(1)
+ if m.lastindex > 1:
+ publicId = m.group(2) or ""
+ systemId = m.group(3) or m.group(4) or ""
+ rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
+ (' ' * indent, name, publicId, systemId))
+ else:
+ rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
+ else:
+ rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
+ elif isinstance(element, Comment):
+ rv.append("|%s<!-- %s -->" % (' ' * indent, element))
+ elif isinstance(element, NavigableString):
+ rv.append("|%s\"%s\"" % (' ' * indent, element))
+ else:
+ if element.namespace:
+ name = "%s %s" % (prefixes[element.namespace],
+ element.name)
+ else:
+ name = element.name
+ rv.append("|%s<%s>" % (' ' * indent, name))
+ if element.attrs:
+ attributes = []
+ for name, value in list(element.attrs.items()):
+ if isinstance(name, NamespacedAttribute):
+ name = "%s %s" % (prefixes[name.namespace], name.name)
+ if isinstance(value, list):
+ value = " ".join(value)
+ attributes.append((name, value))
+
+ for name, value in sorted(attributes):
+ rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+ indent += 2
+ for child in element.children:
+ serializeElement(child, indent)
+ serializeElement(element, 0)
+
+ return "\n".join(rv)
class AttrList(object):
def __init__(self, element):
@@ -101,7 +226,16 @@ class AttrList(object):
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
- "set attr", name, value
+ # If this attribute is a multi-valued attribute for this element,
+ # turn its value into a list.
+ list_attr = self.element.cdata_list_attributes
+ if (name in list_attr['*']
+ or (self.element.name in list_attr
+ and name in list_attr[self.element.name])):
+ # A node that is being cloned may have already undergone
+ # this procedure.
+ if not isinstance(value, list):
+ value = nonwhitespace_re.findall(value)
self.element[name] = value
def items(self):
return list(self.attrs.items())
@@ -115,16 +249,16 @@ class AttrList(object):
return name in list(self.attrs.keys())
-class Element(html5lib.treebuilders._base.Node):
+class Element(treebuilder_base.Node):
def __init__(self, element, soup, namespace):
- html5lib.treebuilders._base.Node.__init__(self, element.name)
+ treebuilder_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def appendChild(self, node):
string_child = child = None
- if isinstance(node, basestring):
+ if isinstance(node, str):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
@@ -136,13 +270,15 @@ class Element(html5lib.treebuilders._base.Node):
child = node
elif node.element.__class__ == NavigableString:
string_child = child = node.element
+ node.parent = self
else:
child = node.element
+ node.parent = self
- if not isinstance(child, basestring) and child.parent is not None:
+ if not isinstance(child, str) and child.parent is not None:
node.element.extract()
- if (string_child and self.element.contents
+ if (string_child is not None and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like
@@ -152,7 +288,7 @@ class Element(html5lib.treebuilders._base.Node):
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
- if isinstance(node, basestring):
+ if isinstance(node, str):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
@@ -161,6 +297,12 @@ class Element(html5lib.treebuilders._base.Node):
# immediately after the parent, if it has no children.)
if self.element.contents:
most_recent_element = self.element._last_descendant(False)
+ elif self.element.next_element is not None:
+ # Something from further ahead in the parse tree is
+ # being inserted into this earlier element. This is
+ # very annoying because it means an expensive search
+ # for the last element in the tree.
+ most_recent_element = self.soup._last_descendant()
else:
most_recent_element = self.element
@@ -169,9 +311,12 @@ class Element(html5lib.treebuilders._base.Node):
most_recent_element=most_recent_element)
def getAttributes(self):
+ if isinstance(self.element, Comment):
+ return {}
return AttrList(self.element)
def setAttributes(self, attributes):
+
if attributes is not None and len(attributes) > 0:
converted_attributes = []
@@ -183,7 +328,7 @@ class Element(html5lib.treebuilders._base.Node):
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
- for name, value in attributes.items():
+ for name, value in list(attributes.items()):
self.element[name] = value
# The attributes may contain variables that need substitution.
@@ -195,11 +340,11 @@ class Element(html5lib.treebuilders._base.Node):
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
+ text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
- text = TextNode(self.soup.new_string(data), self.soup)
- self.insertBefore(data, insertBefore)
+ self.insertBefore(text, insertBefore)
else:
- self.appendChild(data)
+ self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
@@ -218,6 +363,10 @@ class Element(html5lib.treebuilders._base.Node):
def reparentChildren(self, new_parent):
"""Move all of this tag's children into another tag."""
+ # print "MOVE", self.element.contents
+ # print "FROM", self.element
+ # print "TO", new_parent.element
+
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
@@ -236,18 +385,35 @@ class Element(html5lib.treebuilders._base.Node):
new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents
- append_after = new_parent.element.contents
if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
first_child = to_append[0]
- first_child.previous_element = new_parents_last_descendant
+ if new_parents_last_descendant is not None:
+ first_child.previous_element = new_parents_last_descendant
+ else:
+ first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child
+ if new_parents_last_descendant is not None:
+ new_parents_last_descendant.next_element = first_child
+ else:
+ new_parent_element.next_element = first_child
+ if new_parents_last_child is not None:
+ new_parents_last_child.next_sibling = first_child
- # Fix the last child's next_element and next_sibling
- last_child = to_append[-1]
- last_child.next_element = new_parents_last_descendant_next_element
- last_child.next_sibling = None
+ # Find the very last element being moved. It is now the
+ # parent's last descendant. It has no .next_sibling and
+ # its .next_element is whatever the previous last
+ # descendant had.
+ last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+
+ last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
+ if new_parents_last_descendant_next_element is not None:
+ # TODO: This code has no test coverage and I'm not sure
+ # how to get html5lib to go through this path, but it's
+ # just the other side of the previous line.
+ new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+ last_childs_last_descendant.next_sibling = None
for child in to_append:
child.parent = new_parent_element
@@ -257,6 +423,10 @@ class Element(html5lib.treebuilders._base.Node):
element.contents = []
element.next_element = final_next_element
+ # print "DONE WITH MOVE"
+ # print "FROM", self.element
+ # print "TO", new_parent_element
+
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
@@ -268,7 +438,7 @@ class Element(html5lib.treebuilders._base.Node):
return self.element.contents
def getNameTuple(self):
- if self.namespace is None:
+ if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
@@ -277,7 +447,7 @@ class Element(html5lib.treebuilders._base.Node):
class TextNode(Element):
def __init__(self, element, soup):
- html5lib.treebuilders._base.Node.__init__(self, None)
+ treebuilder_base.Node.__init__(self, None)
self.element = element
self.soup = soup
diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py
index ca8d8b89..12e1c9ee 100644
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@@ -1,13 +1,23 @@
+# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
__all__ = [
'HTMLParserTreeBuilder',
]
-from HTMLParser import (
- HTMLParser,
- HTMLParseError,
- )
+from html.parser import HTMLParser
+
+try:
+ from html.parser import HTMLParseError
+except ImportError as e:
+ # HTMLParseError is removed in Python 3.5. Since it can never be
+ # thrown in 3.5, we can just define our own class as a placeholder.
+ class HTMLParseError(Exception):
+ pass
+
import sys
import warnings
@@ -19,10 +29,10 @@ import warnings
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = (
- major > 3
- or (major == 3 and minor > 2)
- or (major == 3 and minor == 2 and release >= 3))
+CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
+CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
+
from bs4.element import (
CData,
@@ -43,7 +53,42 @@ from bs4.builder import (
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
- def handle_starttag(self, name, attrs):
+
+ def __init__(self, *args, **kwargs):
+ HTMLParser.__init__(self, *args, **kwargs)
+
+ # Keep a list of empty-element tags that were encountered
+ # without an explicit closing tag. If we encounter a closing tag
+ # of this type, we'll associate it with one of those entries.
+ #
+ # This isn't a stack because we don't care about the
+ # order. It's a list of closing tags we've already handled and
+ # will ignore, assuming they ever show up.
+ self.already_closed_empty_element = []
+
+ def error(self, msg):
+ """In Python 3, HTMLParser subclasses must implement error(), although this
+ requirement doesn't appear to be documented.
+
+ In Python 2, HTMLParser implements error() as raising an exception.
+
+ In any event, this method is called only on very strange markup and our best strategy
+ is to pretend it didn't happen and keep going.
+ """
+ warnings.warn(msg)
+
+ def handle_startendtag(self, name, attrs):
+ # This is only called when the markup looks like
+ # <tag/>.
+
+ # is_startend() tells handle_starttag not to close the tag
+ # just because its name matches a known empty-element tag. We
+ # know that this is an empty-element tag and we want to call
+ # handle_endtag ourselves.
+ tag = self.handle_starttag(name, attrs, handle_empty_element=False)
+ self.handle_endtag(name)
+
+ def handle_starttag(self, name, attrs, handle_empty_element=True):
# XXX namespace
attr_dict = {}
for key, value in attrs:
@@ -53,17 +98,46 @@ class BeautifulSoupHTMLParser(HTMLParser):
value = ''
attr_dict[key] = value
attrvalue = '""'
- self.soup.handle_starttag(name, None, None, attr_dict)
+ #print "START", name
+ sourceline, sourcepos = self.getpos()
+ tag = self.soup.handle_starttag(
+ name, None, None, attr_dict, sourceline=sourceline,
+ sourcepos=sourcepos
+ )
+ if tag and tag.is_empty_element and handle_empty_element:
+ # Unlike other parsers, html.parser doesn't send separate end tag
+ # events for empty-element tags. (It's handled in
+ # handle_startendtag, but only if the original markup looked like
+ # <tag/>.)
+ #
+ # So we need to call handle_endtag() ourselves. Since we
+ # know the start event is identical to the end event, we
+ # don't want handle_endtag() to cross off any previous end
+ # events for tags of this name.
+ self.handle_endtag(name, check_already_closed=False)
- def handle_endtag(self, name):
- self.soup.handle_endtag(name)
+ # But we might encounter an explicit closing tag for this tag
+ # later on. If so, we want to ignore it.
+ self.already_closed_empty_element.append(name)
+
+ def handle_endtag(self, name, check_already_closed=True):
+ #print "END", name
+ if check_already_closed and name in self.already_closed_empty_element:
+ # This is a redundant end tag for an empty-element tag.
+ # We've already called handle_endtag() for it, so just
+ # check it off the list.
+ # print "ALREADY CLOSED", name
+ self.already_closed_empty_element.remove(name)
+ else:
+ self.soup.handle_endtag(name)
def handle_data(self, data):
self.soup.handle_data(data)
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
- # it's fixed.
+ # it's fixed in all supported versions.
+ # http://bugs.python.org/issue13633
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
@@ -71,11 +145,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
else:
real_name = int(name)
- try:
- data = unichr(real_name)
- except (ValueError, OverflowError), e:
- data = u"\N{REPLACEMENT CHARACTER}"
-
+ data = None
+ if real_name < 256:
+ # HTML numeric entities are supposed to reference Unicode
+ # code points, but sometimes they reference code points in
+ # some other encoding (ahem, Windows-1252). E.g. &#147;
+ # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
+ # code tries to detect this situation and compensate.
+ for encoding in (self.soup.original_encoding, 'windows-1252'):
+ if not encoding:
+ continue
+ try:
+ data = bytearray([real_name]).decode(encoding)
+ except UnicodeDecodeError as e:
+ pass
+ if not data:
+ try:
+ data = chr(real_name)
+ except (ValueError, OverflowError) as e:
+ pass
+ data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
@@ -83,7 +172,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
if character is not None:
data = character
else:
- data = "&%s;" % name
+ # If this were XML, it would be ambiguous whether "&foo"
+ # was a character entity reference with a missing
+ # semicolon or the literal string "&foo". Since this is
+ # HTML, we have a complete list of all character entity references,
+ # and this one wasn't found, so assume it's the literal string "&foo".
+ data = "&%s" % name
self.handle_data(data)
def handle_comment(self, data):
@@ -113,14 +207,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_pi(self, data):
self.soup.endData()
- if data.endswith("?") and data.lower().startswith("xml"):
- # "An XHTML processing instruction using the trailing '?'
- # will cause the '?' to be included in data." - HTMLParser
- # docs.
- #
- # Strip the question mark so we don't end up with two
- # question marks.
- data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
@@ -128,26 +214,38 @@ class BeautifulSoupHTMLParser(HTMLParser):
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
- features = [HTML, STRICT, HTMLPARSER]
+ picklable = True
+ NAME = HTMLPARSER
+ features = [NAME, HTML, STRICT]
- def __init__(self, *args, **kwargs):
- if CONSTRUCTOR_TAKES_STRICT:
- kwargs['strict'] = False
- self.parser_args = (args, kwargs)
+ # The html.parser knows which line number and position in the
+ # original file is the source of an element.
+ TRACKS_LINE_NUMBERS = True
+
+ def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+ super(HTMLParserTreeBuilder, self).__init__(**kwargs)
+ parser_args = parser_args or []
+ parser_kwargs = parser_kwargs or {}
+ if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
+ parser_kwargs['strict'] = False
+ if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
+ parser_kwargs['convert_charrefs'] = False
+ self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
+ document_declared_encoding=None, exclude_encodings=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
- if isinstance(markup, unicode):
+ if isinstance(markup, str):
yield (markup, None, None, False)
return
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True)
+ dammit = UnicodeDammit(markup, try_encodings, is_html=True,
+ exclude_encodings=exclude_encodings)
yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
@@ -158,10 +256,12 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
- except HTMLParseError, e:
+ parser.close()
+ except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
+ parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py
index fa5d4987..f5257963 100644
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@@ -1,13 +1,26 @@
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError as e:
+ from collections import Callable
+
from io import BytesIO
-from StringIO import StringIO
-import collections
+from io import StringIO
from lxml import etree
-from bs4.element import Comment, Doctype, NamespacedAttribute
+from bs4.element import (
+ Comment,
+ Doctype,
+ NamespacedAttribute,
+ ProcessingInstruction,
+ XMLProcessingInstruction,
+)
from bs4.builder import (
FAST,
HTML,
@@ -20,19 +33,55 @@ from bs4.dammit import EncodingDetector
LXML = 'lxml'
+def _invert(d):
+ "Invert a dictionary."
+ return dict((v,k) for k, v in list(d.items()))
+
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
+ processing_instruction_class = XMLProcessingInstruction
+
+ NAME = "lxml-xml"
+ ALTERNATE_NAMES = ["xml"]
# Well, it's permissive by XML parser standards.
- features = [LXML, XML, FAST, PERMISSIVE]
+ features = [NAME, LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace
# standard.
- DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+ DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
+
+ DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
+
+ # NOTE: If we parsed Element objects and looked at .sourceline,
+ # we'd be able to see the line numbers from the original document.
+ # But instead we build an XMLParser or HTMLParser object to serve
+ # as the target of parse messages, and those messages don't include
+ # line numbers.
+
+ def initialize_soup(self, soup):
+ """Let the BeautifulSoup object know about the standard namespace
+ mapping.
+ """
+ super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
+ self._register_namespaces(self.DEFAULT_NSMAPS)
+
+ def _register_namespaces(self, mapping):
+ """Let the BeautifulSoup object know about namespaces encountered
+ while parsing the document.
+
+ This might be useful later on when creating CSS selectors.
+ """
+ for key, value in list(mapping.items()):
+ if key and key not in self.soup._namespaces:
+ # Let the BeautifulSoup object know about a new namespace.
+ # If there are multiple namespaces defined with the same
+ # prefix, the first one in the document takes precedence.
+ self.soup._namespaces[key] = value
def default_parser(self, encoding):
# This can either return a parser object or a class, which
@@ -46,12 +95,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# Use the default parser.
parser = self.default_parser(encoding)
- if isinstance(parser, collections.Callable):
+ if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
- def __init__(self, parser=None, empty_element_tags=None):
+ def __init__(self, parser=None, empty_element_tags=None, **kwargs):
# TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new
# parsers for different encodings.
@@ -59,8 +108,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
self.soup = None
- self.nsmaps = [self.DEFAULT_NSMAPS]
-
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+ super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
+
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
@@ -70,6 +120,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None,
+ exclude_encodings=None,
document_declared_encoding=None):
"""
:yield: A series of 4-tuples.
@@ -78,31 +129,37 @@ class LXMLTreeBuilderForXML(TreeBuilder):
Each 4-tuple represents a strategy for parsing the document.
"""
- if isinstance(markup, unicode):
- # We were given Unicode. Maybe lxml can parse Unicode on
- # this system?
- yield markup, None, document_declared_encoding, False
-
- if isinstance(markup, unicode):
- # No, apparently not. Convert the Unicode to UTF-8 and
- # tell lxml to parse it as UTF-8.
- yield (markup.encode("utf8"), "utf8",
- document_declared_encoding, False)
-
# Instead of using UnicodeDammit to convert the bytestring to
# Unicode using different encodings, use EncodingDetector to
# iterate over the encodings, and tell lxml to try to parse
# the document as each one in turn.
is_html = not self.is_xml
+ if is_html:
+ self.processing_instruction_class = ProcessingInstruction
+ else:
+ self.processing_instruction_class = XMLProcessingInstruction
+
+ if isinstance(markup, str):
+ # We were given Unicode. Maybe lxml can parse Unicode on
+ # this system?
+ yield markup, None, document_declared_encoding, False
+
+ if isinstance(markup, str):
+ # No, apparently not. Convert the Unicode to UTF-8 and
+ # tell lxml to parse it as UTF-8.
+ yield (markup.encode("utf8"), "utf8",
+ document_declared_encoding, False)
+
try_encodings = [user_specified_encoding, document_declared_encoding]
- detector = EncodingDetector(markup, try_encodings, is_html)
+ detector = EncodingDetector(
+ markup, try_encodings, is_html, exclude_encodings)
for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False)
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
- elif isinstance(markup, unicode):
+ elif isinstance(markup, str):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
@@ -117,30 +174,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
- raise ParserRejectedMarkup(str(e))
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
+ raise ParserRejectedMarkup(e)
def close(self):
- self.nsmaps = [self.DEFAULT_NSMAPS]
+ self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
- if len(self.nsmaps) > 1:
- # There are no new namespaces for this tag, but
- # non-default namespaces are in play, so we need a
- # separate tag stack to know when they end.
- self.nsmaps.append(None)
+ if len(nsmap) == 0 and len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
+ self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
- inverted_nsmap = dict((value, key) for key, value in nsmap.items())
- self.nsmaps.append(inverted_nsmap)
+
+ # First, let the BeautifulSoup object know about it.
+ self._register_namespaces(nsmap)
+
+ # Then, add it to our running list of inverted namespace
+ # mappings.
+ self.nsmaps.append(_invert(nsmap))
+
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
- for prefix, namespace in nsmap.items():
+ for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
@@ -149,7 +212,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
- for attr, value in attrs.items():
+ for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
@@ -189,7 +252,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.pop()
def pi(self, target, data):
- pass
+ self.soup.endData()
+ self.soup.handle_data(target + ' ' + data)
+ self.soup.endData(self.processing_instruction_class)
def data(self, content):
self.soup.handle_data(content)
@@ -207,13 +272,17 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'\n%s' % fragment
+ return '\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
- features = [LXML, HTML, FAST, PERMISSIVE]
+ NAME = LXML
+ ALTERNATE_NAMES = ["lxml-html"]
+
+ features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
is_xml = False
+ processing_instruction_class = ProcessingInstruction
def default_parser(self, encoding):
return etree.HTMLParser
@@ -224,10 +293,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
- except (UnicodeDecodeError, LookupError, etree.ParserError), e:
- raise ParserRejectedMarkup(str(e))
+ except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
+ raise ParserRejectedMarkup(e)
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
- return u'%s' % fragment
+ return '%s' % fragment
diff --git a/lib/bs4/check_block.py b/lib/bs4/check_block.py
new file mode 100644
index 00000000..a60a7b74
--- /dev/null
+++ b/lib/bs4/check_block.py
@@ -0,0 +1,4 @@
+import requests
+data = requests.get("https://www.crummy.com/").content
+from bs4 import _s
+data = [x for x in _s(data).block_text()]
diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py
index 32e211dc..5fc6f93a 100644
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@@ -3,12 +3,14 @@
This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It works best on XML and XML, but it does not rewrite the
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
import codecs
-from htmlentitydefs import codepoint2name
+from html.entities import codepoint2name
import re
import logging
import string
@@ -20,6 +22,8 @@ try:
# PyPI package: cchardet
import cchardet
def chardet_dammit(s):
+ if isinstance(s, str):
+ return None
return cchardet.detect(s)['encoding']
except ImportError:
try:
@@ -28,6 +32,8 @@ except ImportError:
# PyPI package: chardet
import chardet
def chardet_dammit(s):
+ if isinstance(s, str):
+ return None
return chardet.detect(s)['encoding']
#import chardet.constants
#chardet.constants._debug = 1
@@ -42,10 +48,19 @@ try:
except ImportError:
pass
-xml_encoding_re = re.compile(
- '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
-html_meta_re = re.compile(
- '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+# Build bytestring and Unicode versions of regular expressions for finding
+# a declared encoding inside an XML or HTML document.
+xml_encoding = '^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
+html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
+encoding_res = dict()
+encoding_res[bytes] = {
+ 'html' : re.compile(html_meta.encode("ascii"), re.I),
+ 'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
+}
+encoding_res[str] = {
+ 'html' : re.compile(html_meta, re.I),
+ 'xml' : re.compile(xml_encoding, re.I)
+}
class EntitySubstitution(object):
@@ -55,15 +70,24 @@ class EntitySubstitution(object):
lookup = {}
reverse_lookup = {}
characters_for_re = []
- for codepoint, name in list(codepoint2name.items()):
- character = unichr(codepoint)
- if codepoint != 34:
+
+ # &apos; is an XHTML entity and an HTML 5 entity, but not an HTML 4
+ # entity. We don't want to use it, but we want to recognize it on the way in.
+ #
+ # TODO: Ideally we would be able to recognize all HTML 5 named
+ # entities, but that's a little tricky.
+ extra = [(39, 'apos')]
+ for codepoint, name in list(codepoint2name.items()) + extra:
+ character = chr(codepoint)
+ if codepoint not in (34, 39):
# There's no point in turning the quotation mark into
- # &quot;, unless it happens within an attribute value, which
- # is handled elsewhere.
+ # &quot; or the single quote into &apos;, unless it
+ # happens within an attribute value, which is handled
+ # elsewhere.
characters_for_re.append(character)
lookup[character] = name
- # But we do want to turn &quot; into the quotation mark.
+ # But we do want to recognize those entities on the way in and
+ # convert them to Unicode characters.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
@@ -79,7 +103,7 @@ class EntitySubstitution(object):
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
- "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
+ "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@@ -212,8 +236,11 @@ class EncodingDetector:
5. Windows-1252.
"""
- def __init__(self, markup, override_encodings=None, is_html=False):
+ def __init__(self, markup, override_encodings=None, is_html=False,
+ exclude_encodings=None):
self.override_encodings = override_encodings or []
+ exclude_encodings = exclude_encodings or []
+ self.exclude_encodings = set([x.lower() for x in exclude_encodings])
self.chardet_encoding = None
self.is_html = is_html
self.declared_encoding = None
@@ -224,6 +251,8 @@ class EncodingDetector:
def _usable(self, encoding, tried):
if encoding is not None:
encoding = encoding.lower()
+ if encoding in self.exclude_encodings:
+ return False
if encoding not in tried:
tried.add(encoding)
return True
@@ -266,6 +295,9 @@ class EncodingDetector:
def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None
+ if isinstance(data, str):
+ # Unicode data cannot have a byte-order mark.
+ return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
@@ -300,14 +332,22 @@ class EncodingDetector:
xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05))
+ if isinstance(markup, bytes):
+ res = encoding_res[bytes]
+ else:
+ res = encoding_res[str]
+
+ xml_re = res['xml']
+ html_re = res['html']
declared_encoding = None
- declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+ declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html:
- declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+ declared_encoding_match = html_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None:
- declared_encoding = declared_encoding_match.groups()[0].decode(
- 'ascii')
+ declared_encoding = declared_encoding_match.groups()[0]
if declared_encoding:
+ if isinstance(declared_encoding, bytes):
+ declared_encoding = declared_encoding.decode('ascii', 'replace')
return declared_encoding.lower()
return None
@@ -331,18 +371,19 @@ class UnicodeDammit:
]
def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, is_html=False):
+ smart_quotes_to=None, is_html=False, exclude_encodings=[]):
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
self.is_html = is_html
-
- self.detector = EncodingDetector(markup, override_encodings, is_html)
+ self.log = logging.getLogger(__name__)
+ self.detector = EncodingDetector(
+ markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with.
- if isinstance(markup, unicode) or markup == '':
+ if isinstance(markup, str) or markup == '':
self.markup = markup
- self.unicode_markup = unicode(markup)
+ self.unicode_markup = str(markup)
self.original_encoding = None
return
@@ -365,9 +406,10 @@ class UnicodeDammit:
if encoding != "ascii":
u = self._convert_from(encoding, "replace")
if u is not None:
- logging.warning(
+ self.log.warning(
"Some characters could not be decoded, and were "
- "replaced with REPLACEMENT CHARACTER.")
+ "replaced with REPLACEMENT CHARACTER."
+ )
self.contains_replacement_characters = True
break
@@ -425,7 +467,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
- return unicode(data, encoding, errors)
+ return str(data, encoding, errors)
@property
def declared_html_encoding(self):
@@ -723,7 +765,7 @@ class UnicodeDammit:
0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à
- 0xe1 : b'\xa1', # á
+ 0xe1 : b'\xa1', # á
0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä
diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py
index b7c99b1c..a1ae23dc 100644
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@@ -1,7 +1,11 @@
"""Diagnostic functions, mainly for use when doing tech support."""
+
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
import cProfile
-from StringIO import StringIO
-from HTMLParser import HTMLParser
+from io import StringIO
+from html.parser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
@@ -17,8 +21,8 @@ import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
- print "Diagnostic running on Beautiful Soup %s" % __version__
- print "Python version %s" % sys.version
+ print("Diagnostic running on Beautiful Soup %s" % __version__)
+ print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
@@ -27,44 +31,60 @@ def diagnose(data):
break
else:
basic_parsers.remove(name)
- print (
+ print((
"I noticed that %s is not installed. Installing it may help." %
- name)
+ name))
if 'lxml' in basic_parsers:
- basic_parsers.append(["lxml", "xml"])
- from lxml import etree
- print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+ basic_parsers.append("lxml-xml")
+ try:
+ from lxml import etree
+ print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
+ except ImportError as e:
+ print (
+ "lxml is not installed or couldn't be imported.")
+
if 'html5lib' in basic_parsers:
- import html5lib
- print "Found html5lib version %s" % html5lib.__version__
+ try:
+ import html5lib
+ print("Found html5lib version %s" % html5lib.__version__)
+ except ImportError as e:
+ print (
+ "html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
- elif os.path.exists(data):
- print '"%s" looks like a filename. Reading data from the file.' % data
- data = open(data).read()
elif data.startswith("http:") or data.startswith("https:"):
- print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
- print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+ print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+ print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
- print
+ else:
+ try:
+ if os.path.exists(data):
+ print('"%s" looks like a filename. Reading data from the file.' % data)
+ with open(data) as fp:
+ data = fp.read()
+ except ValueError:
+ # This can happen on some platforms when the 'filename' is
+ # too long. Assume it's data and not a filename.
+ pass
+ print()
for parser in basic_parsers:
- print "Trying to parse your markup with %s" % parser
+ print("Trying to parse your markup with %s" % parser)
success = False
try:
- soup = BeautifulSoup(data, parser)
+ soup = BeautifulSoup(data, features=parser)
success = True
- except Exception, e:
- print "%s could not parse the markup." % parser
+ except Exception as e:
+ print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
- print "Here's what %s did with the markup:" % parser
- print soup.prettify()
+ print("Here's what %s did with the markup:" % parser)
+ print(soup.prettify())
- print "-" * 80
+ print("-" * 80)
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
@@ -74,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
- print("%s, %4s, %s" % (event, element.tag, element.text))
+ print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else."""
@@ -135,7 +155,7 @@ def rword(length=5):
def rsentence(length=4):
"Generate a random sentence-like string."
return " ".join(rword(random.randint(4,9)) for i in range(length))
-
+
def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document."""
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
@@ -156,10 +176,10 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
- print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+ print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements)
- print "Generated a large invalid HTML document (%d bytes)." % len(data)
-
+ print("Generated a large invalid HTML document (%d bytes)." % len(data))
+
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
try:
@@ -167,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
- except Exception, e:
- print "%s could not parse the markup." % parser
+ except Exception as e:
+ print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
- print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+ print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
- print "Raw lxml parsed the markup in %.2fs." % (b-a)
+ print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
- print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+ print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"):
diff --git a/lib/bs4/element.py b/lib/bs4/element.py
index da9afdf4..69399e5c 100644
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@@ -1,13 +1,35 @@
-import collections
+# Use of this source code is governed by the MIT license.
+__license__ = "MIT"
+
+try:
+ from collections.abc import Callable # Python 3.6
+except ImportError as e:
+ from collections import Callable
import re
import sys
import warnings
-from bs4.dammit import EntitySubstitution
+try:
+ import soupsieve
+except ImportError as e:
+ soupsieve = None
+ warnings.warn(
+ 'The soupsieve package is not installed. CSS selectors cannot be used.'
+ )
+
+from bs4.formatter import (
+ Formatter,
+ HTMLFormatter,
+ XMLFormatter,
+)
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
-whitespace_re = re.compile("\s+")
+nonwhitespace_re = re.compile(r"\S+")
+
+# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
+# the off chance someone imported it for their own use.
+whitespace_re = re.compile(r"\s+")
def _alias(attr):
"""Alias one attribute name to another for backward compatibility"""
@@ -21,22 +43,27 @@ def _alias(attr):
return alias
-class NamespacedAttribute(unicode):
+class NamespacedAttribute(str):
+
+ def __new__(cls, prefix, name=None, namespace=None):
+ if not name:
+ # This is the default namespace. Its name "has no value"
+ # per https://www.w3.org/TR/xml-names/#defaulting
+ name = None
- def __new__(cls, prefix, name, namespace=None):
if name is None:
- obj = unicode.__new__(cls, prefix)
+ obj = str.__new__(cls, prefix)
elif prefix is None:
# Not really namespaced.
- obj = unicode.__new__(cls, name)
+ obj = str.__new__(cls, name)
else:
- obj = unicode.__new__(cls, prefix + ":" + name)
+ obj = str.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
obj.name = name
obj.namespace = namespace
return obj
-class AttributeValueWithCharsetSubstitution(unicode):
+class AttributeValueWithCharsetSubstitution(str):
"""A stand-in object for a character encoding specified in HTML."""
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@@ -47,7 +74,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
"""
def __new__(cls, original_value):
- obj = unicode.__new__(cls, original_value)
+ obj = str.__new__(cls, original_value)
obj.original_value = original_value
return obj
@@ -64,15 +91,15 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
The value of the 'content' attribute will be one of these objects.
"""
- CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
+ CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
def __new__(cls, original_value):
match = cls.CHARSET_RE.search(original_value)
if match is None:
# No substitution necessary.
- return unicode.__new__(unicode, original_value)
+ return str.__new__(str, original_value)
- obj = unicode.__new__(cls, original_value)
+ obj = str.__new__(cls, original_value)
obj.original_value = original_value
return obj
@@ -81,128 +108,96 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
-class HTMLAwareEntitySubstitution(EntitySubstitution):
-
- """Entity substitution rules that are aware of some HTML quirks.
-
- Specifically, the contents of
+
This numeric entity is missing the final semicolon:
+
+"""
+
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
- return default_builder()
+ return default_builder
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
- def document_for(self, markup):
+ def document_for(self, markup, **kwargs):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
- return self.default_builder.test_fragment_to_document(markup)
+ return self.default_builder(**kwargs).test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
@@ -43,6 +85,131 @@ class SoupTest(unittest.TestCase):
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
+ def assertConnectedness(self, element):
+ """Ensure that next_element and previous_element are properly
+ set for all descendants of the given element.
+ """
+ earlier = None
+ for e in element.descendants:
+ if earlier:
+ self.assertEqual(e, earlier.next_element)
+ self.assertEqual(earlier, e.previous_element)
+ earlier = e
+
+ def linkage_validator(self, el, _recursive_call=False):
+ """Ensure proper linkage throughout the document."""
+ descendant = None
+ # Document element should have no previous element or previous sibling.
+ # It also shouldn't have a next sibling.
+ if el.parent is None:
+ assert el.previous_element is None,\
+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ el, el.previous_element, None
+ )
+ assert el.previous_sibling is None,\
+ "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ el, el.previous_sibling, None
+ )
+ assert el.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+ el, el.next_sibling, None
+ )
+
+ idx = 0
+ child = None
+ last_child = None
+ last_idx = len(el.contents) - 1
+ for child in el.contents:
+ descendant = None
+
+ # Parent should link next element to their first child
+ # That child should have no previous sibling
+ if idx == 0:
+ if el.parent is not None:
+ assert el.next_element is child,\
+ "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
+ el, el.next_element, child
+ )
+ assert child.previous_element is el,\
+ "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
+ child, child.previous_element, el
+ )
+ assert child.previous_sibling is None,\
+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
+ child, child.previous_sibling, None
+ )
+
+ # If not the first child, previous index should link as sibling to this index
+ # Previous element should match the last index or the last bubbled up descendant
+ else:
+ assert child.previous_sibling is el.contents[idx - 1],\
+ "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
+ child, child.previous_sibling, el.contents[idx - 1]
+ )
+ assert el.contents[idx - 1].next_sibling is child,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
+ )
+
+ if last_child is not None:
+ assert child.previous_element is last_child,\
+ "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
+ child, child.previous_element, last_child, child.parent.contents
+ )
+ assert last_child.next_element is child,\
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ last_child, last_child.next_element, child
+ )
+
+ if isinstance(child, Tag) and child.contents:
+ descendant = self.linkage_validator(child, True)
+ # A bubbled up descendant should have no next siblings
+ assert descendant.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ descendant, descendant.next_sibling, None
+ )
+
+ # Mark last child as either the bubbled up descendant or the current child
+ if descendant is not None:
+ last_child = descendant
+ else:
+ last_child = child
+
+ # If last child, there are no next siblings
+ if idx == last_idx:
+ assert child.next_sibling is None,\
+ "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_sibling, None
+ )
+ idx += 1
+
+ child = descendant if descendant is not None else child
+ if child is None:
+ child = el
+
+ if not _recursive_call and child is not None:
+ target = el
+ while True:
+ if target is None:
+ assert child.next_element is None, \
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_element, None
+ )
+ break
+ elif target.next_sibling is not None:
+ assert child.next_element is target.next_sibling, \
+ "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
+ child, child.next_element, target.next_sibling
+ )
+ break
+ target = target.parent
+
+ # We are done, so nothing to return
+ return None
+ else:
+ # Return the child to the recursive caller
+ return child
+
class HTMLTreeBuilderSmokeTest(object):
@@ -54,6 +221,27 @@ class HTMLTreeBuilderSmokeTest(object):
markup in these tests, there's not much room for interpretation.
"""
+ def test_empty_element_tags(self):
+ """Verify that all HTML4 and HTML5 empty element (aka void element) tags
+ are handled correctly.
+ """
+ for name in [
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+ 'spacer', 'frame'
+ ]:
+ soup = self.soup("")
+ new_tag = soup.new_tag(name)
+ self.assertEqual(True, new_tag.is_empty_element)
+
+ def test_pickle_and_unpickle_identity(self):
+ # Pickling a tree, then unpickling it, yields a tree identical
+ # to the original.
+ tree = self.soup("foo")
+ dumped = pickle.dumps(tree, 2)
+ loaded = pickle.loads(dumped)
+ self.assertEqual(loaded.__class__, BeautifulSoup)
+ self.assertEqual(loaded.decode(), tree.decode())
+
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
@@ -114,6 +302,27 @@ class HTMLTreeBuilderSmokeTest(object):
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
+ def test_namespaced_html(self):
+ """When a namespaced XML document is parsed as HTML it should
+ be treated as HTML with weird tag names.
+ """
+ markup = b"""content"""
+ soup = self.soup(markup)
+ self.assertEqual(2, len(soup.find_all("ns1:foo")))
+
+ def test_processing_instruction(self):
+ # We test both Unicode and bytestring to verify that
+ # process_markup correctly sets processing_instruction_class
+ # even when the markup is already Unicode and there is no
+ # need to process anything.
+ markup = """"""
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.decode())
+
+ markup = b""""""
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.encode("utf8"))
+
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
@@ -155,6 +364,23 @@ class HTMLTreeBuilderSmokeTest(object):
def test_nested_formatting_elements(self):
self.assertSoupEquals("")
+ def test_double_head(self):
+ html = '''
+
+
+Ordinary HEAD element test
+
+
+
+Hello, world!
+
+
+'''
+ soup = self.soup(html)
+ self.assertEqual("text/javascript", soup.find('script')['type'])
+
def test_comment(self):
# Comments are represented as Comment objects.
markup = "
foobaz
"
@@ -171,9 +397,22 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self):
- """Whitespace must be preserved in
and
")
+ def test_multivalued_attribute_with_whitespace(self):
+ # Whitespace separating the values of a multi-valued attribute
+ # should be ignored.
+
+ markup = '
'
+ soup = self.soup(markup)
+ self.assertEqual(['foo', 'bar'], soup.div['class'])
+
+ # If you search by the literal name of the class it's like the whitespace
+ # wasn't there.
+ self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
+
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
@@ -221,18 +472,52 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
+ def test_multivalued_attribute_on_html(self):
+ # html5lib uses a different API to set the attributes of the
+ # tag. This has caused problems with multivalued
+ # attributes.
+ markup = ''
+ soup = self.soup(markup)
+ self.assertEqual(["a", "b"], soup.html['class'])
+
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('', '')
+ def test_strings_resembling_character_entity_references(self):
+ # "&T" and "&p" look like incomplete character entities, but they are
+ # not.
+ self.assertSoupEquals(
+ "
",
+ )
+
+ def test_entities_in_foreign_document_encoding(self):
+ # and are invalid numeric entities referencing
+ # Windows-1252 characters. - references a character common
+ # to Windows-1252 and Unicode, and ☃ references a
+ # character only found in Unicode.
+ #
+ # All of these entities should be converted to Unicode
+ # characters.
+ markup = "
", expect)
@@ -243,16 +528,52 @@ class HTMLTreeBuilderSmokeTest(object):
'
I said "good day!"
')
def test_out_of_range_entity(self):
- expect = u"\N{REPLACEMENT CHARACTER}"
+ expect = "\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("", expect)
self.assertSoupEquals("", expect)
self.assertSoupEquals("", expect)
-
+
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("
\nfoo
")
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
+ self.assertConnectedness(soup)
+
+ def test_empty_element_tags(self):
+ """Verify consistent handling of empty-element tags,
+ no matter how they come in through the markup.
+ """
+ self.assertSoupEquals('
', "
")
+ self.assertSoupEquals('
', "
")
+
+ def test_head_tag_between_head_and_body(self):
+ "Prevent recurrence of a bug in the html5lib treebuilder."
+ content = """
+
+ foo
+
+"""
+ soup = self.soup(content)
+ self.assertNotEqual(None, soup.html.body)
+ self.assertConnectedness(soup)
+
+ def test_multiple_copies_of_a_tag(self):
+ "Prevent recurrence of a bug in the html5lib treebuilder."
+ content = """
+
+
+
+
+
+
+
+
+"""
+ soup = self.soup(content)
+ self.assertConnectedness(soup.article)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
@@ -285,9 +606,9 @@ class HTMLTreeBuilderSmokeTest(object):
# A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the
# encoding found in the declaration! The horror!
- markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
+ markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
soup = self.soup(markup)
- self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+ self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
@@ -327,7 +648,7 @@ class HTMLTreeBuilderSmokeTest(object):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "
<<sacré bleu!>>
"
- expected = u"
<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
"
+ expected = "
<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
@@ -337,15 +658,15 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
- u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
+ "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("")
- self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
+ self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "
<<sacré bleu!>>
"
- expected = u"
<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
".encode("utf-8")
+ expected = "
<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>
".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
@@ -354,7 +675,7 @@ class HTMLTreeBuilderSmokeTest(object):
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
- unicode_html = u'
Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!
'
+ unicode_html = '
Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!
'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
@@ -399,7 +720,9 @@ class HTMLTreeBuilderSmokeTest(object):
hebrew_document = b'Hebrew (ISO 8859-8) in Visual Directionality
Hebrew (ISO 8859-8) in Visual Directionality
\xed\xe5\xec\xf9'
soup = self.soup(
hebrew_document, from_encoding="iso8859-8")
- self.assertEqual(soup.original_encoding, 'iso8859-8')
+ # Some tree builders call it iso8859-8, others call it iso-8859-8.
+ # That's not a difference we really care about.
+ assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
self.assertEqual(
soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8"))
@@ -461,13 +784,39 @@ class HTMLTreeBuilderSmokeTest(object):
data.a['foo'] = 'bar'
self.assertEqual('text', data.a.decode())
+ def test_worst_case(self):
+ """Test the worst case (currently) for linking issues."""
+
+ soup = self.soup(BAD_DOCUMENT)
+ self.linkage_validator(soup)
+
+
class XMLTreeBuilderSmokeTest(object):
+ def test_pickle_and_unpickle_identity(self):
+ # Pickling a tree, then unpickling it, yields a tree identical
+ # to the original.
+ tree = self.soup("foo")
+ dumped = pickle.dumps(tree, 2)
+ loaded = pickle.loads(dumped)
+ self.assertEqual(loaded.__class__, BeautifulSoup)
+ self.assertEqual(loaded.decode(), tree.decode())
+
def test_docstring_generated(self):
soup = self.soup("")
self.assertEqual(
soup.encode(), b'\n')
+ def test_xml_declaration(self):
+ markup = b"""\n"""
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.encode("utf8"))
+
+ def test_processing_instruction(self):
+ markup = b"""\n"""
+ soup = self.soup(markup)
+ self.assertEqual(markup, soup.encode("utf8"))
+
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""
@@ -480,12 +829,23 @@ class XMLTreeBuilderSmokeTest(object):
self.assertEqual(
soup.encode("utf-8"), markup)
+ def test_nested_namespaces(self):
+ doc = b"""
+
+
+
+
+
+"""
+ soup = self.soup(doc)
+ self.assertEqual(doc, soup.encode())
+
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
"""
- soup = BeautifulSoup(doc, "xml")
+ soup = BeautifulSoup(doc, "lxml-xml")
# lxml would have stripped this while parsing, but we can add
# it later.
soup.script.string = 'console.log("< < hey > > ");'
@@ -493,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object):
self.assertTrue(b"< < hey > >" in encoded)
def test_can_parse_unicode_document(self):
- markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
+ markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!'
soup = self.soup(markup)
- self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+ self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self):
markup = 'b2012-07-02T20:33:42Zcd'
soup = self.soup(markup)
self.assertEqual(
- unicode(soup.rss), markup)
+ str(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("")
@@ -532,17 +892,57 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self):
markup = '
20010504
'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.p), markup)
+ self.assertEqual(str(soup.p), markup)
def test_namespaced_attributes(self):
markup = ''
soup = self.soup(markup)
- self.assertEqual(unicode(soup.foo), markup)
+ self.assertEqual(str(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = 'bar'
soup = self.soup(markup)
- self.assertEqual(unicode(soup.foo), markup)
+ self.assertEqual(str(soup.foo), markup)
+
+ def test_find_by_prefixed_name(self):
+ doc = """
+foo
+ bar
+ baz
+
+"""
+ soup = self.soup(doc)
+
+ # There are three tags.
+ self.assertEqual(3, len(soup.find_all('tag')))
+
+ # But two of them are ns1:tag and one of them is ns2:tag.
+ self.assertEqual(2, len(soup.find_all('ns1:tag')))
+ self.assertEqual(1, len(soup.find_all('ns2:tag')))
+
+ self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
+ self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
+
+ def test_copy_tag_preserves_namespace(self):
+ xml = """
+"""
+
+ soup = self.soup(xml)
+ tag = soup.document
+ duplicate = copy.copy(tag)
+
+ # The two tags have the same namespace prefix.
+ self.assertEqual(tag.prefix, duplicate.prefix)
+
+ def test_worst_case(self):
+ """Test the worst case (currently) for linking issues."""
+
+ soup = self.soup(BAD_DOCUMENT)
+ self.linkage_validator(soup)
+
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
diff --git a/lib/bs4/tests/__init__.py b/lib/bs4/tests/__init__.py
new file mode 100644
index 00000000..142c8cc3
--- /dev/null
+++ b/lib/bs4/tests/__init__.py
@@ -0,0 +1 @@
+"The beautifulsoup tests."
diff --git a/lib/bs4/tests/test_builder_registry.py b/lib/bs4/tests/test_builder_registry.py
new file mode 100644
index 00000000..90cad829
--- /dev/null
+++ b/lib/bs4/tests/test_builder_registry.py
@@ -0,0 +1,147 @@
+"""Tests of the builder registry."""
+
+import unittest
+import warnings
+
+from bs4 import BeautifulSoup
+from bs4.builder import (
+ builder_registry as registry,
+ HTMLParserTreeBuilder,
+ TreeBuilderRegistry,
+)
+
+try:
+ from bs4.builder import HTML5TreeBuilder
+ HTML5LIB_PRESENT = True
+except ImportError:
+ HTML5LIB_PRESENT = False
+
+try:
+ from bs4.builder import (
+ LXMLTreeBuilderForXML,
+ LXMLTreeBuilder,
+ )
+ LXML_PRESENT = True
+except ImportError:
+ LXML_PRESENT = False
+
+
+class BuiltInRegistryTest(unittest.TestCase):
+ """Test the built-in registry with the default builders registered."""
+
+ def test_combination(self):
+ if LXML_PRESENT:
+ self.assertEqual(registry.lookup('fast', 'html'),
+ LXMLTreeBuilder)
+
+ if LXML_PRESENT:
+ self.assertEqual(registry.lookup('permissive', 'xml'),
+ LXMLTreeBuilderForXML)
+ self.assertEqual(registry.lookup('strict', 'html'),
+ HTMLParserTreeBuilder)
+ if HTML5LIB_PRESENT:
+ self.assertEqual(registry.lookup('html5lib', 'html'),
+ HTML5TreeBuilder)
+
+ def test_lookup_by_markup_type(self):
+ if LXML_PRESENT:
+ self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
+ self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
+ else:
+ self.assertEqual(registry.lookup('xml'), None)
+ if HTML5LIB_PRESENT:
+ self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
+ else:
+ self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
+
+ def test_named_library(self):
+ if LXML_PRESENT:
+ self.assertEqual(registry.lookup('lxml', 'xml'),
+ LXMLTreeBuilderForXML)
+ self.assertEqual(registry.lookup('lxml', 'html'),
+ LXMLTreeBuilder)
+ if HTML5LIB_PRESENT:
+ self.assertEqual(registry.lookup('html5lib'),
+ HTML5TreeBuilder)
+
+ self.assertEqual(registry.lookup('html.parser'),
+ HTMLParserTreeBuilder)
+
+ def test_beautifulsoup_constructor_does_lookup(self):
+
+ with warnings.catch_warnings(record=True) as w:
+ # This will create a warning about not explicitly
+ # specifying a parser, but we'll ignore it.
+
+ # You can pass in a string.
+ BeautifulSoup("", features="html")
+ # Or a list of strings.
+ BeautifulSoup("", features=["html", "fast"])
+
+ # You'll get an exception if BS can't find an appropriate
+ # builder.
+ self.assertRaises(ValueError, BeautifulSoup,
+ "", features="no-such-feature")
+
+class RegistryTest(unittest.TestCase):
+ """Test the TreeBuilderRegistry class in general."""
+
+ def setUp(self):
+ self.registry = TreeBuilderRegistry()
+
+ def builder_for_features(self, *feature_list):
+ cls = type('Builder_' + '_'.join(feature_list),
+ (object,), {'features' : feature_list})
+
+ self.registry.register(cls)
+ return cls
+
+ def test_register_with_no_features(self):
+ builder = self.builder_for_features()
+
+ # Since the builder advertises no features, you can't find it
+ # by looking up features.
+ self.assertEqual(self.registry.lookup('foo'), None)
+
+ # But you can find it by doing a lookup with no features, if
+ # this happens to be the only registered builder.
+ self.assertEqual(self.registry.lookup(), builder)
+
+ def test_register_with_features_makes_lookup_succeed(self):
+ builder = self.builder_for_features('foo', 'bar')
+ self.assertEqual(self.registry.lookup('foo'), builder)
+ self.assertEqual(self.registry.lookup('bar'), builder)
+
+ def test_lookup_fails_when_no_builder_implements_feature(self):
+ builder = self.builder_for_features('foo', 'bar')
+ self.assertEqual(self.registry.lookup('baz'), None)
+
+ def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
+ builder1 = self.builder_for_features('foo')
+ builder2 = self.builder_for_features('bar')
+ self.assertEqual(self.registry.lookup(), builder2)
+
+ def test_lookup_fails_when_no_tree_builders_registered(self):
+ self.assertEqual(self.registry.lookup(), None)
+
+ def test_lookup_gets_most_recent_builder_supporting_all_features(self):
+ has_one = self.builder_for_features('foo')
+ has_the_other = self.builder_for_features('bar')
+ has_both_early = self.builder_for_features('foo', 'bar', 'baz')
+ has_both_late = self.builder_for_features('foo', 'bar', 'quux')
+ lacks_one = self.builder_for_features('bar')
+ has_the_other = self.builder_for_features('foo')
+
+ # There are two builders featuring 'foo' and 'bar', but
+ # the one that also features 'quux' was registered later.
+ self.assertEqual(self.registry.lookup('foo', 'bar'),
+ has_both_late)
+
+ # There is only one builder featuring 'foo', 'bar', and 'baz'.
+ self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
+ has_both_early)
+
+ def test_lookup_fails_when_cannot_reconcile_requested_features(self):
+ builder1 = self.builder_for_features('foo', 'bar')
+ builder2 = self.builder_for_features('foo', 'baz')
+ self.assertEqual(self.registry.lookup('bar', 'baz'), None)
diff --git a/lib/bs4/tests/test_docs.py b/lib/bs4/tests/test_docs.py
new file mode 100644
index 00000000..5b9f6770
--- /dev/null
+++ b/lib/bs4/tests/test_docs.py
@@ -0,0 +1,36 @@
+"Test harness for doctests."
+
+# pylint: disable=E0611,W0142
+
+__metaclass__ = type
+# additional_tests() is commented out below, so it must not be exported:
+# listing an undefined name makes a star-import raise AttributeError.
+__all__ = []
+
+import atexit
+import doctest
+import os
+#from pkg_resources import (
+# resource_filename, resource_exists, resource_listdir, cleanup_resources)
+import unittest
+
+# Option flags referenced by the commented-out doctest suite below.
+DOCTEST_FLAGS = (doctest.ELLIPSIS
+                 | doctest.NORMALIZE_WHITESPACE
+                 | doctest.REPORT_NDIFF)
+
+
+# def additional_tests():
+# "Run the doc tests (README.txt and docs/*, if any exist)"
+# doctest_files = [
+# os.path.abspath(resource_filename('bs4', 'README.txt'))]
+# if resource_exists('bs4', 'docs'):
+# for name in resource_listdir('bs4', 'docs'):
+# if name.endswith('.txt'):
+# doctest_files.append(
+# os.path.abspath(
+# resource_filename('bs4', 'docs/%s' % name)))
+# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
+# atexit.register(cleanup_resources)
+# return unittest.TestSuite((
+# doctest.DocFileSuite(*doctest_files, **kwargs)))
diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py
new file mode 100644
index 00000000..d7a0b298
--- /dev/null
+++ b/lib/bs4/tests/test_html5lib.py
@@ -0,0 +1,184 @@
+"""Tests to ensure that the html5lib tree builder generates good trees."""
+
+import warnings
+
+try:
+    from bs4.builder import HTML5TreeBuilder
+    HTML5LIB_PRESENT = True
+except ImportError:  # builder unavailable; the skipIf below checks this flag
+    HTML5LIB_PRESENT = False
+from bs4.element import SoupStrainer
+from bs4.testing import (
+ HTML5TreeBuilderSmokeTest,
+ SoupTest,
+ skipIf,
+)
+
+@skipIf(
+ not HTML5LIB_PRESENT,
+ "html5lib seems not to be present, not testing its tree builder.")
+class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
+ """See ``HTML5TreeBuilderSmokeTest``."""
+
+ @property
+ def default_builder(self):
+ return HTML5TreeBuilder  # returned as imported above; not called here
+
+    def test_soupstrainer(self):
+        # The html5lib tree builder does not support SoupStrainers.
+        strainer = SoupStrainer("b")
+        markup = "<p>A <b>bold</b> statement.</p>"
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(markup, parse_only=strainer)
+        self.assertEqual(
+            soup.decode(), self.document_for(markup))
+
+        self.assertIn(
+            "the html5lib tree builder doesn't support parse_only",
+            str(w[0].message))
+
+ def test_correctly_nested_tables(self):
+ """html5lib inserts tags where other parsers don't."""
+ markup = ('
\n", soup.body.decode())
+ self.assertEqual(2, len(soup.find_all('p')))
+
+ def test_reparented_markup_containing_identical_whitespace_nodes(self):
+ """Verify that we keep the two whitespace nodes in this
+ document distinct when reparenting the adjacent