# -*- coding: utf-8 -*-
# Copyright (c) 2011 Red Hat, Inc
# Copyright (c) 2010 Seth Vidal
#
# kitchen is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# kitchen is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with kitchen; if not, see <http://www.gnu.org/licenses/>
#
# Authors:
#   James Antill
#   Toshio Kuratomi <toshio@fedoraproject.org>
#   Seth Vidal
#
# Portions of this code taken from yum/misc.py and yum/i18n.py
'''
---------------------------------------------
Miscellaneous functions for manipulating text
---------------------------------------------

Collection of text functions that don't fit in another category.
'''
import codecs
import htmlentitydefs
import itertools
import re

try:
    import chardet
except ImportError:
    chardet = None

# We need to access b_() for localizing our strings but we'll end up with
# a circular import if we import it directly.
import kitchen as k
from kitchen.pycompat24 import sets
from kitchen.text.exceptions import ControlCharError

# NOTE(review): presumably makes set/frozenset available as builtins on old
# Pythons that lack them (frozenset is used by the constants below) -- confirm
# against kitchen.pycompat24.sets
sets.add_builtin_set()

# Minimum confidence chardet must report before its guess is trusted.  Below
# this, byte strings whose encoding we are guessing get decoded as latin1.
# (The spelling of the name is kept as-is because other code refers to it.)
_CHARDET_THRESHHOLD = 0.6

# ASCII control codes that are illegal in xml 1.0.  The only code points below
# 32 that xml 1.0 allows are tab (9), line feed (10), and carriage return
# (13), so every other value in 0-31 is listed here -- including backspace (8)
_CONTROL_CODES = frozenset(itertools.chain(range(0, 9), (11, 12),
                                           range(14, 32)))
# The same control codes as one-character unicode strings
_CONTROL_CHARS = frozenset(map(unichr, _CONTROL_CODES))

# Matches either a markup tag (``<`` through the next ``>``) or an entity or
# numeric character reference such as ``&amp;`` or ``&#38;``
_ENTITY_RE = re.compile(r'(?s)<[^>]*>|&#?\w+;')

def guess_encoding(byte_string, disable_chardet=False):
    '''Try to guess the encoding of a byte :class:`str`

    :arg byte_string: byte :class:`str` to guess the encoding of
    :kwarg disable_chardet: If this is True, we never attempt to use
        :mod:`chardet` to guess the encoding.  This is useful if you need to
        have reproducibility whether :mod:`chardet` is installed or not.
        Default: :data:`False`.
    :raises TypeError: if :attr:`byte_string` is not a byte :class:`str` type
    :returns: string containing a guess at the encoding of
        :attr:`byte_string`.  This is appropriate to pass as the encoding
        argument when encoding and decoding unicode strings.

    We start by attempting to decode the byte :class:`str` as :term:`UTF-8`.
    If this succeeds we tell the world it's :term:`UTF-8` text.  If it doesn't
    and :mod:`chardet` is installed on the system and :attr:`disable_chardet`
    is False this function will use it to try detecting the encoding of
    :attr:`byte_string`.  The :mod:`chardet` guess is only used when its
    confidence is high enough *and* python has a codec registered under the
    name :mod:`chardet` reports; otherwise the name could not be used for
    decoding at all.  If :mod:`chardet` is not installed or its guess is
    rejected then we rather arbitrarily claim that it is ``latin-1``.  Since
    ``latin-1`` will encode to every byte, decoding from ``latin-1`` to
    :class:`unicode` will not cause a :exc:`UnicodeError` although the output
    might be mangled.
    '''
    if not isinstance(byte_string, str):
        raise TypeError(k.b_('byte_string must be a byte string (str)'))

    # A strict decode that succeeds is all the evidence we ask for before
    # declaring the string to be UTF-8
    try:
        unicode(byte_string, 'utf-8', 'strict')
    except UnicodeDecodeError:
        pass
    else:
        return 'utf-8'

    if chardet and not disable_chardet:
        detection_info = chardet.detect(byte_string)
        if detection_info['confidence'] >= _CHARDET_THRESHHOLD:
            detected = detection_info['encoding']
            if detected:
                # chardet knows about encodings that python ships no codec
                # for.  Returning one of those would hand the caller a name
                # that raises LookupError as soon as it is used.
                try:
                    codecs.lookup(detected)
                except LookupError:
                    pass
                else:
                    return detected

    # Nothing better to offer: latin-1 maps every byte so decoding cannot fail
    return 'latin-1'

def str_eq(str1, str2, encoding='utf-8', errors='replace'):
    '''Compare two stringsi, converting to byte :class:`str` if one is
    :class:`unicode`

    :arg str1: First string to compare