From 416d646a05446de2f3644359da84a7ecb5f99776 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 Aug 2018 17:51:07 -0500 Subject: [PATCH 1/3] Fix encoding detection logic for pypy --- pudb/lowlevel.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/pudb/lowlevel.py b/pudb/lowlevel.py index 041dfc1..9180bf5 100644 --- a/pudb/lowlevel.py +++ b/pudb/lowlevel.py @@ -26,7 +26,7 @@ THE SOFTWARE. """ -from pudb.py3compat import PY3 +from pudb.py3compat import PY3, text_type # {{{ breakpoint validity @@ -122,7 +122,7 @@ if PY3: BOM_UTF8 = BOM_UTF8.decode() -def detect_encoding(lines): +def detect_encoding(line_iter): """ The detect_encoding() function is used to detect the encoding that should be used to decode a Python source file. It requires one argment, lines, @@ -140,11 +140,10 @@ def detect_encoding(lines): If no encoding is specified, then the default of 'utf-8' will be returned. """ bom_found = False - line_iterator = iter(lines) def read_or_stop(): try: - return next(line_iterator) + return next(line_iter) except StopIteration: return '' @@ -173,6 +172,9 @@ def detect_encoding(lines): return encoding first = read_or_stop() + if isinstance(first, text_type): + return None, [first] + if first.startswith(BOM_UTF8): bom_found = True first = first[3:] @@ -195,13 +197,18 @@ def detect_encoding(lines): def decode_lines(lines): - source_enc, _ = detect_encoding(lines) + line_iter = iter(lines) + source_enc, detection_read_lines = detect_encoding(line_iter) + + for line in detection_read_lines: + yield line - for line in lines: - if hasattr(line, "decode"): + for line in line_iter: + if hasattr(line, "decode") and source_enc is not None: yield line.decode(source_enc) else: yield line + # }}} -- GitLab From bb370fd66bbc4162893e72c06eea908eb65e39c1 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 Aug 2018 18:01:38 -0500 Subject: [PATCH 2/3] Fix encoding detection logic --- pudb/lowlevel.py | 15 ++++++--------- test/test_lowlevel.py | 13 ++++++++----- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pudb/lowlevel.py b/pudb/lowlevel.py index 9180bf5..744c723 100644 --- a/pudb/lowlevel.py +++ b/pudb/lowlevel.py @@ -116,17 +116,15 @@ def lookup_module(filename): # the main idea stolen from Python 3.1's tokenize.py, by Ka-Ping Yee import re -cookie_re = re.compile("^\s*#.*coding[:=]\s*([-\w.]+)") +cookie_re = re.compile(b"^\s*#.*coding[:=]\s*([-\w.]+)") from codecs import lookup, BOM_UTF8 -if PY3: - BOM_UTF8 = BOM_UTF8.decode() def detect_encoding(line_iter): """ The detect_encoding() function is used to detect the encoding that should - be used to decode a Python source file. It requires one argment, lines, - iterable lines stream. + be used to decode a Python source file. It requires one argment, line_iter, + an iterator on the lines to be read. It will read a maximum of two lines, and return the encoding used (as a string) and a list of any lines (left as bytes) it has read @@ -159,7 +157,7 @@ def detect_encoding(line_iter): matches = cookie_re.findall(line_string) if not matches: return None - encoding = matches[0] + encoding = matches[0].decode() try: codec = lookup(encoding) except LookupError: @@ -200,10 +198,9 @@ def decode_lines(lines): line_iter = iter(lines) source_enc, detection_read_lines = detect_encoding(line_iter) - for line in detection_read_lines: - yield line + from itertools import chain - for line in line_iter: + for line in chain(detection_read_lines, line_iter): if hasattr(line, "decode") and source_enc is not None: yield line.decode(source_enc) else: diff --git a/test/test_lowlevel.py b/test/test_lowlevel.py index b7c2f3d..b753fa9 100644 --- a/test/test_lowlevel.py +++ b/test/test_lowlevel.py @@ -5,7 +5,8 @@ from pudb.py3compat import PY3 def test_detect_encoding_nocookie(): lines = ['Test Проверка'] - encoding, _ = detect_encoding(lines) + lines = [l.encode('utf-8') for l in lines] + encoding, _ = detect_encoding(iter(lines)) assert encoding == 'utf-8' @@ -15,17 +16,19 @@ def test_detect_encoding_cookie(): 'Test', 'Проверка' ] - encoding, _ = detect_encoding(lines) + lines = [l.encode('utf-8') for l in lines] + encoding, _ = detect_encoding(iter(lines)) assert encoding == 'utf-8' def test_decode_lines(): - lines = [ + unicode_lines = [ '# coding=utf-8', 'Test', 'Проверка', ] + lines = [l.encode('utf-8') for l in unicode_lines] if PY3: - assert lines == list(decode_lines(lines)) + assert unicode_lines == list(decode_lines(iter(lines))) else: - assert [l.decode('utf-8') for l in lines] == list(decode_lines(lines)) + assert [l.decode('utf-8') for l in lines] == list(decode_lines(iter(lines))) -- GitLab From f4fc96a69aebc0086239c596fe0496c4d25fa368 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 20 Aug 2018 18:15:28 -0500 Subject: [PATCH 3/3] Fix encoding tests on py2 --- test/test_lowlevel.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_lowlevel.py b/test/test_lowlevel.py index b753fa9..7678823 100644 --- a/test/test_lowlevel.py +++ b/test/test_lowlevel.py @@ -4,7 +4,7 @@ from pudb.py3compat import PY3 def test_detect_encoding_nocookie(): - lines = ['Test Проверка'] + lines = [u'Test Проверка'] lines = [l.encode('utf-8') for l in lines] encoding, _ = detect_encoding(iter(lines)) assert encoding == 'utf-8' @@ -12,9 +12,9 @@ def test_detect_encoding_nocookie(): def test_detect_encoding_cookie(): lines = [ - '# coding=utf-8', - 'Test', - 'Проверка' + u'# coding=utf-8', + u'Test', + u'Проверка' ] lines = [l.encode('utf-8') for l in lines] encoding, _ = detect_encoding(iter(lines)) @@ -23,9 +23,9 @@ def test_detect_encoding_cookie(): def test_decode_lines(): unicode_lines = [ - '# coding=utf-8', - 'Test', - 'Проверка', + u'# coding=utf-8', + u'Test', + u'Проверка', ] lines = [l.encode('utf-8') for l in unicode_lines] if PY3: -- GitLab