diff --git a/pudb/lowlevel.py b/pudb/lowlevel.py
index 041dfc1d43fcf9319f44269059219155cc980d06..744c723414d1576acaa094642a112a151ec211de 100644
--- a/pudb/lowlevel.py
+++ b/pudb/lowlevel.py
@@ -26,7 +26,7 @@ THE SOFTWARE.
 """
 
 
-from pudb.py3compat import PY3
+from pudb.py3compat import PY3, text_type
 
 
 # {{{ breakpoint validity
@@ -116,17 +116,15 @@ def lookup_module(filename):
 # the main idea stolen from Python 3.1's tokenize.py, by Ka-Ping Yee
 
 import re
-cookie_re = re.compile("^\s*#.*coding[:=]\s*([-\w.]+)")
+cookie_re = re.compile(br"^\s*#.*coding[:=]\s*([-\w.]+)")
 from codecs import lookup, BOM_UTF8
-if PY3:
-    BOM_UTF8 = BOM_UTF8.decode()
 
 
-def detect_encoding(lines):
+def detect_encoding(line_iter):
     """
     The detect_encoding() function is used to detect the encoding that should
-    be used to decode a Python source file. It requires one argment, lines,
-    iterable lines stream.
+    be used to decode a Python source file. It requires one argument, line_iter,
+    an iterator over the lines to be read.
 
     It will read a maximum of two lines, and return the encoding used
     (as a string) and a list of any lines (left as bytes) it has read
@@ -140,11 +138,10 @@ def detect_encoding(lines):
     If no encoding is specified, then the default of 'utf-8' will be returned.
     """
     bom_found = False
-    line_iterator = iter(lines)
 
     def read_or_stop():
         try:
-            return next(line_iterator)
+            return next(line_iter)
         except StopIteration:
             return ''
 
@@ -160,7 +157,7 @@ def detect_encoding(lines):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = matches[0].decode()
         try:
             codec = lookup(encoding)
         except LookupError:
@@ -173,6 +170,9 @@ def detect_encoding(lines):
         return encoding
 
     first = read_or_stop()
+    if isinstance(first, text_type):
+        return None, [first]
+
     if first.startswith(BOM_UTF8):
         bom_found = True
         first = first[3:]
@@ -195,13 +195,17 @@ def detect_encoding(lines):
 
 
 def decode_lines(lines):
-    source_enc, _ = detect_encoding(lines)
+    line_iter = iter(lines)
+    source_enc, detection_read_lines = detect_encoding(line_iter)
+
+    from itertools import chain
 
-    for line in lines:
-        if hasattr(line, "decode"):
+    for line in chain(detection_read_lines, line_iter):
+        if hasattr(line, "decode") and source_enc is not None:
             yield line.decode(source_enc)
         else:
             yield line
 
+
 # }}}
 
diff --git a/test/test_lowlevel.py b/test/test_lowlevel.py
index b7c2f3d22ecf743148054f4e39d3836283847cbb..7678823e76d5992788c47ad75294694c74d12d3c 100644
--- a/test/test_lowlevel.py
+++ b/test/test_lowlevel.py
@@ -4,28 +4,31 @@ from pudb.py3compat import PY3
 
 
 def test_detect_encoding_nocookie():
-    lines = ['Test Проверка']
-    encoding, _ = detect_encoding(lines)
+    lines = [u'Test Проверка']
+    lines = [l.encode('utf-8') for l in lines]
+    encoding, _ = detect_encoding(iter(lines))
     assert encoding == 'utf-8'
 
 
 def test_detect_encoding_cookie():
     lines = [
-        '# coding=utf-8',
-        'Test',
-        'Проверка'
+        u'# coding=utf-8',
+        u'Test',
+        u'Проверка'
     ]
-    encoding, _ = detect_encoding(lines)
+    lines = [l.encode('utf-8') for l in lines]
+    encoding, _ = detect_encoding(iter(lines))
     assert encoding == 'utf-8'
 
 
 def test_decode_lines():
-    lines = [
-        '# coding=utf-8',
-        'Test',
-        'Проверка',
+    unicode_lines = [
+        u'# coding=utf-8',
+        u'Test',
+        u'Проверка',
     ]
+    lines = [l.encode('utf-8') for l in unicode_lines]
     if PY3:
-        assert lines == list(decode_lines(lines))
+        assert unicode_lines == list(decode_lines(iter(lines)))
     else:
-        assert [l.decode('utf-8') for l in lines] == list(decode_lines(lines))
+        assert [l.decode('utf-8') for l in lines] == list(decode_lines(iter(lines)))
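A quick sketch of the intended behavior, for review convenience (illustrative
only: the bytes case mirrors the updated test_decode_lines, the text case
follows from the new text_type branch in detect_encoding):

    from pudb.lowlevel import decode_lines

    # Byte input: the PEP 263 coding cookie on the first line selects the
    # codec. detect_encoding() consumes up to two lines while looking for
    # it, and decode_lines() replays them via itertools.chain, so every
    # input line still comes out exactly once.
    raw = [u'# coding=utf-8'.encode('utf-8'), u'Проверка'.encode('utf-8')]
    assert list(decode_lines(raw)) == [u'# coding=utf-8', u'Проверка']

    # Text input: detect_encoding() returns (None, [first]) and the
    # "source_enc is not None" guard passes already-decoded lines
    # through unchanged.
    assert list(decode_lines([u'x = 1'])) == [u'x = 1']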