From 660ee1b566f379c7672894f8cd7e4dbb641893ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 Jan 2020 13:58:28 +0100 Subject: [PATCH 1/7] Try to fix bad JSON due to unescaped double quotes --- extruct/jsonld.py | 14 ++------------ extruct/rdfa.py | 4 ++-- extruct/utils.py | 27 +++++++++++++++++++++++++++ tests/test_utils.py | 20 ++++++++++++++++++++ 4 files changed, 51 insertions(+), 14 deletions(-) create mode 100644 tests/test_utils.py diff --git a/extruct/jsonld.py b/extruct/jsonld.py index f11580eb..e43ad9e8 100644 --- a/extruct/jsonld.py +++ b/extruct/jsonld.py @@ -4,13 +4,10 @@ """ import json -import re import lxml.etree -from extruct.utils import parse_html - -HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|<!--.*-->)') +from extruct.utils import parse_html, parse_json class JsonLdExtractor(object): @@ -28,14 +25,7 @@ def extract_items(self, document, base_url=None): ] def _extract_items(self, node): - script = node.xpath('string()') - try: - # TODO: `strict=False` can be configurable if needed - data = json.loads(script, strict=False) - except ValueError: - # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments - data = json.loads( - HTML_OR_JS_COMMENTLINE.sub('', script), strict=False) + data = parse_json(node.xpath('string()')) if isinstance(data, list): return data elif isinstance(data, dict): diff --git a/extruct/rdfa.py b/extruct/rdfa.py index e5ab06bd..33f28ed3 100644 --- a/extruct/rdfa.py +++ b/extruct/rdfa.py @@ -14,7 +14,7 @@ from rdflib.plugins.parsers.pyRdfa import pyRdfa as PyRdfa, Options, logger as pyrdfa_logger from rdflib.plugins.parsers.pyRdfa.initialcontext import initial_context -from extruct.utils import parse_xmldom_html +from extruct.utils import parse_json, parse_xmldom_html # silence rdflib/PyRdfa INFO logs @@ -46,4 +46,4 @@ def extract_items(self, document, base_url=None, expanded=True): g = PyRdfa(options, base=base_url).graph_from_DOM(document, graph=Graph(), pgraph=Graph()) 
jsonld_string = g.serialize(format='json-ld', auto_compact=not expanded).decode('utf-8') - return json.loads(jsonld_string) + return parse_json(jsonld_string) diff --git a/extruct/utils.py b/extruct/utils.py index a29a61f6..25e111d9 100644 --- a/extruct/utils.py +++ b/extruct/utils.py @@ -1,4 +1,8 @@ # -*- coding: utf-8 -*- + +import json +import re + import lxml.html from extruct.xmldom import XmlDomHTMLParser @@ -10,6 +14,29 @@ def parse_html(html, encoding): return lxml.html.fromstring(html, parser=parser) +HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|<!--.*-->)') + + +def parse_json(json_string): + try: + return json.loads(json_string, strict=False) + except ValueError: + # sometimes JSON-decoding errors are due to leading HTML or JavaScript comments + json_string = HTML_OR_JS_COMMENTLINE.sub('', json_string) + while True: + try: + return json.loads(json_string, strict=False) + except json.JSONDecodeError as error: + if error.msg == "Expecting ',' delimiter": + if json_string[error.pos-1] == '"': + insertion_position = error.pos-1 + prefix = json_string[:insertion_position] + suffix = json_string[insertion_position:] + json_string = prefix + '\\' + suffix + continue + raise + + def parse_xmldom_html(html, encoding): """ Parse HTML using XmlDomHTMLParser, return a tree """ parser = XmlDomHTMLParser(encoding=encoding) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 00000000..381bc8f5 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,20 @@ +from pytest import mark + +from extruct.utils import parse_json + + +@mark.parametrize( + 'input,output', + [ + ( + '{"a": ["10\'5""]}', + {'a': ['10\'5"']}, + ), + ( + '{"a": ["Say "Hello""]}', + {'a': ['Say "Hello"']}, + ), + ] +) +def test_parse_json(input, output): + assert parse_json(input) == output From 515eea6460fffd377c6a52afd31d993cc4c20ff7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 7 May 2020 19:27:07 +0200 Subject: [PATCH 2/7] Support Python 2 --- 
extruct/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/extruct/utils.py b/extruct/utils.py index 25e111d9..5137a9c8 100644 --- a/extruct/utils.py +++ b/extruct/utils.py @@ -3,6 +3,11 @@ import json import re +try: + from json.decoder import JSONDecodeError +except ImportError: + JSONDecodeError = ValueError + import lxml.html from extruct.xmldom import XmlDomHTMLParser @@ -26,7 +31,7 @@ def parse_json(json_string): while True: try: return json.loads(json_string, strict=False) - except json.JSONDecodeError as error: + except JSONDecodeError as error: if error.msg == "Expecting ',' delimiter": if json_string[error.pos-1] == '"': insertion_position = error.pos-1 From cddef0cc7422b2632a2036ae2d3a57f8d185be66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 7 May 2020 19:28:38 +0200 Subject: [PATCH 3/7] Do not be eager --- extruct/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extruct/utils.py b/extruct/utils.py index 5137a9c8..110c7d2f 100644 --- a/extruct/utils.py +++ b/extruct/utils.py @@ -19,7 +19,7 @@ def parse_html(html, encoding): return lxml.html.fromstring(html, parser=parser) -HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|<!--.*-->)') +HTML_OR_JS_COMMENTLINE = re.compile(r'^\s*(//.*|<!--.*?-->)') def parse_json(json_string): From 6b9e5b88e58d8f18a4a6ef6d179f800778bd60ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 12 Aug 2020 14:27:46 +0200 Subject: [PATCH 4/7] Fix Python 2 support --- extruct/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extruct/utils.py b/extruct/utils.py index 110c7d2f..0bf1fa9e 100644 --- a/extruct/utils.py +++ b/extruct/utils.py @@ -32,7 +32,7 @@ def parse_json(json_string): try: return json.loads(json_string, strict=False) except JSONDecodeError as error: - if error.msg == "Expecting ',' delimiter": + if "Expecting ',' delimiter" in str(error): if json_string[error.pos-1] == '"': insertion_position = 
error.pos-1 prefix = json_string[:insertion_position] From ee79de5e208208830896e138b676bcdc4d412473 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 12 Aug 2020 14:40:52 +0200 Subject: [PATCH 5/7] Fix Python 2 support --- extruct/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extruct/utils.py b/extruct/utils.py index 0bf1fa9e..21da3250 100644 --- a/extruct/utils.py +++ b/extruct/utils.py @@ -32,7 +32,7 @@ def parse_json(json_string): try: return json.loads(json_string, strict=False) except JSONDecodeError as error: - if "Expecting ',' delimiter" in str(error): + if re.search("Expecting (?:','|,) delimiter", str(error)): if json_string[error.pos-1] == '"': insertion_position = error.pos-1 prefix = json_string[:insertion_position] From d0087124475589a54a58d2c54171be74b16248ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 12 Aug 2020 17:38:48 +0200 Subject: [PATCH 6/7] Make changes Python 3 only --- extruct/utils.py | 17 ++++++++++------- tests/test_utils.py | 3 +++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/extruct/utils.py b/extruct/utils.py index 21da3250..0ec8b710 100644 --- a/extruct/utils.py +++ b/extruct/utils.py @@ -32,13 +32,16 @@ def parse_json(json_string): try: return json.loads(json_string, strict=False) except JSONDecodeError as error: - if re.search("Expecting (?:','|,) delimiter", str(error)): - if json_string[error.pos-1] == '"': - insertion_position = error.pos-1 - prefix = json_string[:insertion_position] - suffix = json_string[insertion_position:] - json_string = prefix + '\\' + suffix - continue + if ( + hasattr(error, 'msg') + and error.msg == "Expecting ',' delimiter" + and json_string[error.pos-1] == '"' + ): + insertion_position = error.pos-1 + prefix = json_string[:insertion_position] + suffix = json_string[insertion_position:] + json_string = prefix + '\\' + suffix + continue raise diff --git a/tests/test_utils.py b/tests/test_utils.py 
index 381bc8f5..1c669cee 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,8 +1,11 @@ +from sys import version_info + from pytest import mark from extruct.utils import parse_json +@mark.skipif(version_info < (3,), reason="requires Python 3") @mark.parametrize( 'input,output', [ From 96c77af4e47929d9d2674aba41f3b974ecd03200 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 12 Aug 2020 18:17:32 +0200 Subject: [PATCH 7/7] Test Python 2 scenario --- tests/test_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 1c669cee..a3a3988c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,11 +1,10 @@ from sys import version_info -from pytest import mark +from pytest import mark, raises from extruct.utils import parse_json -@mark.skipif(version_info < (3,), reason="requires Python 3") @mark.parametrize( 'input,output', [ @@ -20,4 +19,8 @@ ] ) def test_parse_json(input, output): - assert parse_json(input) == output + if version_info >= (3,): + assert parse_json(input) == output + else: + with raises(ValueError): + parse_json(input)