Spaces:
Build error
Build error
File size: 22,406 Bytes
01523b5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 |
# jsonrepair.py - Repair invalid JSON documents in Python
#
# A port of https://github.com/josdejong/jsonrepair from TypeScript to Python.
#
# This port won't get updates, because the goal should be to generate this library instead.
#
# See: https://github.com/josdejong/jsonrepair/issues/84
#
import json
import re
from typing import Optional
# Raw control characters mapped to the two-character escape sequence that
# JsonRepair.parse_string() writes in their place when it finds them
# unescaped inside a string.
CONTROL_CHARACTERS = {"\b": "\\b", "\f": "\\f", "\n": "\\n", "\r": "\\r", "\t": "\\t"}
# Characters that form a valid escape sequence when they follow a backslash,
# mapped to the character that sequence stands for. parse_string() only looks
# a character up here to decide whether the escape can be copied through.
ESCAPE_CHARACTERS = {
    '"': '"',
    "\\": "\\",
    "/": "/",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t"
    # note that \u is handled separately in parse_string()
}
def remove_at_index(text: str, start: int, count: int) -> str:
    """Return ``text`` with ``count`` characters cut out, beginning at ``start``."""
    head = text[:start]
    tail = text[start + count:]
    return head + tail
def is_control_character(char: str) -> bool:
    """Tell whether ``char`` is one of the keys of ``CONTROL_CHARACTERS``."""
    known_control_characters = CONTROL_CHARACTERS.keys()
    return char in known_control_characters
def is_valid_string_character(char: str) -> bool:
    """Tell whether ``char`` has a code point in the range U+0020..U+10FFFF."""
    code_point = ord(char)
    if code_point < 0x20:
        return False
    return code_point <= 0x10FFFF
def is_quote(char: str) -> bool:
    """Tell whether ``char`` is any single or double quote variant."""
    if is_single_quote(char):
        return True
    return is_double_quote(char)
def is_single_quote(char: str) -> bool:
    """Tell whether ``char`` is a single quote.

    Besides the ASCII apostrophe this accepts the typographic left/right
    single quotes, the backtick and the acute accent.
    """
    single_quote_variants = {
        "'",  # U+0027
        "‘",  # U+2018
        "’",  # U+2019
        "`",  # U+0060
        "´",  # U+00B4
    }
    return char in single_quote_variants
def is_double_quote(char: str) -> bool:
    """Tell whether ``char`` is an ASCII, left or right double quote."""
    checks = (is_ascii_double_quote, is_double_quote_left, is_double_quote_right)
    return any(check(char) for check in checks)
def is_ascii_double_quote(char: str) -> bool:
    """Tell whether ``char`` is the plain ASCII double quote (U+0022)."""
    ascii_double_quote = '"'  # U+0022
    return char == ascii_double_quote
def is_double_quote_left(char: str) -> bool:
    """Tell whether ``char`` is the left typographic double quote (U+201C)."""
    left_double_quote = "“"  # U+201C
    return char == left_double_quote
def is_double_quote_right(char: str) -> bool:
    """Tell whether ``char`` is the right typographic double quote (U+201D)."""
    right_double_quote = "”"  # U+201D
    return char == right_double_quote
def is_start_of_value(char: str) -> bool:
    """Tell whether ``char`` can be the first character of a JSON-like value.

    That is: a word character (letter, digit or underscore), a minus sign,
    an opening bracket or brace, or any quote character.
    """
    # The "[" inside the character set is escaped on purpose: an unescaped
    # "[" directly after the opening "[" makes the ``re`` module emit a
    # "Possible nested set" FutureWarning.
    regex_start_of_value = r"^[\[{\w-]$"
    return bool(re.search(regex_start_of_value, char)) or is_quote(char)
def ends_with_comma_or_newline(text: str) -> bool:
    """Tell whether ``text`` ends with a comma or newline.

    Trailing spaces, tabs and carriage returns after that character are
    ignored.
    """
    match = re.search(r"[,\n][ \t\r]*$", text)
    return match is not None
def is_whitespace(char: str) -> bool:
    """Tell whether ``char`` is one of the four JSON whitespace characters.

    JSON only allows space, newline, tab and carriage return between tokens.
    ``str.isspace()`` is deliberately not used: it also accepts characters
    such as form feed, vertical tab and U+2028, which would be copied into
    the repaired output as "whitespace" and make it invalid JSON.
    """
    # A tuple rather than a string, so that "" is not reported as whitespace.
    return char in (" ", "\n", "\t", "\r")
def is_special_whitespace(char: str) -> bool:
    """Tell whether ``char`` is a Unicode space variant rather than a plain one.

    Covers the non-breaking space (U+00A0), the range U+2000..U+200A, and
    U+202F, U+205F and U+3000.
    """
    if char == "\u00A0":  # non-breaking space
        return True
    if ord("\u2000") <= ord(char) <= ord("\u200A"):
        return True
    return char in ("\u202F", "\u205F", "\u3000")
def insert_before_last_whitespace(text: str, text_to_insert: str) -> str:
    """Insert ``text_to_insert`` in front of the trailing whitespace of ``text``.

    When ``text`` has no trailing whitespace, ``text_to_insert`` is simply
    appended. Empty and all-whitespace input is handled: the insertion then
    happens at the very start.
    """
    index = len(text)
    # The ``index > 0`` guard stops the scan at the start of the text, so an
    # empty or whitespace-only string no longer runs off the front.
    while index > 0 and is_whitespace(text[index - 1]):
        index -= 1
    return text[:index] + text_to_insert + text[index:]
def strip_last_occurrence(
    text: str, text_to_strip: str, strip_remaining: bool = False
) -> str:
    """Remove the last occurrence of ``text_to_strip`` from ``text``.

    Args:
        text: The text to edit.
        text_to_strip: The substring whose last occurrence is removed.
        strip_remaining: When true, everything after that occurrence is
            removed as well.

    Returns:
        The edited text, or ``text`` unchanged when ``text_to_strip`` does
        not occur in it.
    """
    # rfind() reports a missing needle with -1 instead of raising, which
    # gives the "not found -> unchanged" behaviour.
    index = text.rfind(text_to_strip)
    if index == -1:
        return text
    if strip_remaining:
        return text[:index]
    # Skip the whole needle, not just its first character.
    return text[:index] + text[index + len(text_to_strip):]
def is_hex(char: str) -> bool:
    """Tell whether ``char`` is exactly one ASCII hexadecimal digit.

    ``int(char, 16)`` is not used for this check: it also accepts non-ASCII
    Unicode digits, which are not valid inside a JSON ``\\uXXXX`` escape.
    """
    # The length check also rules out "", which is a substring of any string.
    return len(char) == 1 and char in "0123456789abcdefABCDEF"
def is_delimiter(char: str) -> bool:
    """Tell whether ``char`` ends an unquoted symbol.

    Delimiters are the comma, colon, brackets, braces, parentheses, the
    newline and every quote character.
    """
    structural_delimiters = ",:[]{}()\n'"
    if char in structural_delimiters:
        return True
    return is_quote(char)
def at_end_of_block_comment(text: str, i: int) -> bool:
    """Tell whether the block-comment terminator ``*/`` starts at index ``i``.

    Slicing is used instead of indexing so that a ``*`` sitting on the very
    last position of ``text`` yields ``False`` rather than an IndexError.
    """
    return text[i : i + 2] == "*/"
class JsonRepairError(Exception):
    """Raised when a document cannot be repaired.

    The exception message is ``message`` followed by the position; the
    position is also available as the ``position`` attribute.
    """

    def __init__(self, message: str, position: int):
        located_message = message + f" at position {position}"
        super().__init__(located_message)
        self.position = position
class JsonRepair:
    """Repair invalid JSON, i.e. change JavaScript notation into JSON notation.

    The parser walks ``text`` with a cursor ``i`` and appends the repaired
    document to ``output``. Each ``parse_*`` method returns ``True`` when it
    consumed input at the cursor and ``False`` when it does not apply there.

    Example:

        try:
            text = "{name: 'John'}"
            repaired = JsonRepair(text).repair()
            print(repaired)
            # '{"name": "John"}'
        except JsonRepairError as err:
            print(err)
    """

    # ASCII digits only: ``str.isdigit()`` also accepts other Unicode digits,
    # which must not end up inside a JSON number.
    _DIGITS = frozenset("0123456789")

    def __init__(self, text: str):
        self.text = text
        self.i = 0  # current index in text
        self.output = ""  # generated output

    # ------------------------------------------------------------------
    # cursor helpers
    # ------------------------------------------------------------------

    def char(self, pos: int = 0) -> str:
        """Return the character ``pos`` places from the cursor (no bounds check)."""
        return self.text[self.i + pos]

    def inc(self, by: int = 1) -> None:
        """Move the cursor forward by ``by`` characters."""
        self.i += by

    def dec(self, by: int = 1) -> None:
        """Move the cursor back by ``by`` characters."""
        self.i -= by

    def is_start_of_document(self, pos: int = 0) -> bool:
        """Tell whether the cursor (offset by ``pos``) is at index 0."""
        return self.i + pos == 0

    def is_end_of_document(self, pos: int = 0) -> bool:
        """Tell whether the cursor (offset by ``pos``) is at or past the end."""
        return self.i + pos >= len(self.text)

    def _is_digit(self, char: str) -> bool:
        """Tell whether ``char`` is one ASCII digit."""
        return char in self._DIGITS

    # ------------------------------------------------------------------
    # entry point
    # ------------------------------------------------------------------

    def repair(self) -> str:
        """Repair the document and return it as a JSON string.

        Raises:
            JsonRepairError: when the document contains no value, or when
                something is left over that cannot be repaired.
        """
        processed = self.parse_value()
        if not processed:
            raise self.unexpected_end()

        processed_comma = self.parse_character(",")
        if processed_comma:
            self.parse_whitespace_and_skip_comments()

        if (
            not self.is_end_of_document()
            and is_start_of_value(self.char())
            and ends_with_comma_or_newline(self.output)
        ):
            # start of a new value after end of the root level object: looks like
            # newline delimited JSON -> turn into a root level array
            if not processed_comma:
                # repair missing comma
                self.output = insert_before_last_whitespace(self.output, ",")
            self.parse_newline_delimited_json()
        elif processed_comma:
            # repair: remove trailing comma
            self.output = strip_last_occurrence(self.output, ",")

        if self.is_end_of_document():
            # reached the end of the document properly
            return self.output

        raise self.unexpected_character()

    # ------------------------------------------------------------------
    # values, whitespace and comments
    # ------------------------------------------------------------------

    def parse_value(self) -> bool:
        """Parse one value of any kind, with surrounding whitespace/comments."""
        self.parse_whitespace_and_skip_comments()
        processed = (
            self.parse_object()
            or self.parse_array()
            or self.parse_string()
            or self.parse_number()
            or self.parse_keywords()
            or self.parse_unquoted_string()
        )
        self.parse_whitespace_and_skip_comments()
        return processed

    def parse_whitespace_and_skip_comments(self) -> bool:
        """Copy whitespace and drop comments; return whether the cursor moved."""
        start = self.i
        self.parse_whitespace()
        # keep going for as long as a comment is followed by more whitespace
        while self.parse_comment() and self.parse_whitespace():
            pass
        return self.i > start

    def parse_whitespace(self) -> bool:
        """Copy a run of whitespace to the output.

        Special (Unicode) whitespace characters are replaced by a plain space.
        """
        pieces = []
        while not self.is_end_of_document():
            char = self.char()
            if is_special_whitespace(char):
                pieces.append(" ")  # repair special whitespace
            elif is_whitespace(char):
                pieces.append(char)
            else:
                break
            self.inc()

        if pieces:
            self.output += "".join(pieces)
            return True
        return False

    def parse_comment(self) -> bool:
        """Skip one block comment ``/* ... */`` or line comment ``// ...``.

        Nothing is written to the output. An unterminated block comment
        swallows the rest of the document.
        """
        # both comment openers are two characters long
        if self.is_end_of_document() or self.is_end_of_document(pos=+1):
            return False

        # find a block comment '/* ... */'
        if self.char() == "/" and self.char(pos=+1) == "*":
            # repair block comment by skipping it; the look-ahead guard keeps
            # the two-character terminator check inside the text
            while not self.is_end_of_document(pos=+1) and not at_end_of_block_comment(
                self.text, self.i
            ):
                self.inc()
            if self.is_end_of_document(pos=+1):
                # unterminated comment: nothing but comment text is left
                self.i = len(self.text)
            else:
                self.inc(by=2)
            return True

        # find a line comment '// ...'
        if self.char() == "/" and self.char(pos=+1) == "/":
            # repair line comment by skipping it
            while not self.is_end_of_document() and self.char() != "\n":
                self.inc()
            return True

        return False

    def parse_character(self, char: str) -> bool:
        """Copy ``char`` to the output when it is the current character."""
        if not self.is_end_of_document() and self.char() == char:
            self.output += char
            self.inc()
            return True
        return False

    def skip_character(self, char: str) -> bool:
        """Step over ``char`` without writing it when it is the current character."""
        if not self.is_end_of_document() and self.char() == char:
            self.inc()
            return True
        return False

    def skip_escape_character(self) -> bool:
        """Step over a backslash when it is the current character."""
        return self.skip_character("\\")

    # ------------------------------------------------------------------
    # containers
    # ------------------------------------------------------------------

    def parse_object(self) -> bool:
        """Parse an object like '{"key": "value"}'.

        Repairs missing commas, missing colons, trailing commas and a missing
        closing brace.

        Raises:
            JsonRepairError: when a key, colon or value is expected but
                cannot be found or repaired.
        """
        if self.is_end_of_document() or self.char() != "{":
            return False

        self.output += "{"
        self.inc()
        self.parse_whitespace_and_skip_comments()

        initial = True
        while not self.is_end_of_document() and self.char() != "}":
            if initial:
                initial = False
            else:
                processed_comma = self.parse_character(",")
                if not processed_comma:
                    # repair missing comma
                    self.output = insert_before_last_whitespace(self.output, ",")
                self.parse_whitespace_and_skip_comments()

            processed_key = self.parse_string() or self.parse_unquoted_string()
            if not processed_key:
                if self.is_end_of_document() or self.char() in "{}[]":
                    # repair trailing comma
                    self.output = strip_last_occurrence(self.output, ",")
                    break
                raise self.object_key_expected()

            self.parse_whitespace_and_skip_comments()
            processed_colon = self.parse_character(":")
            if not processed_colon:
                # the end-of-document check keeps char() in bounds when the
                # input stops right after a key
                if not self.is_end_of_document() and is_start_of_value(self.char()):
                    # repair missing colon
                    self.output = insert_before_last_whitespace(self.output, ":")
                else:
                    raise self.colon_expected()

            processed_value = self.parse_value()
            if not processed_value:
                if processed_colon:
                    raise self.object_value_expected()
                raise self.colon_expected()

        if not self.is_end_of_document() and self.char() == "}":
            self.output += "}"
            self.inc()
        else:
            # repair missing end bracket
            self.output = insert_before_last_whitespace(self.output, "}")

        return True

    def parse_array(self) -> bool:
        """Parse an array like '["item1", "item2", ...]'.

        Repairs missing commas, a trailing comma and a missing closing bracket.
        """
        if self.is_end_of_document() or self.char() != "[":
            return False

        self.output += "["
        self.inc()
        self.parse_whitespace_and_skip_comments()

        initial = True
        while not self.is_end_of_document() and self.char() != "]":
            if initial:
                initial = False
            else:
                processed_comma = self.parse_character(",")
                if not processed_comma:
                    # repair missing comma
                    self.output = insert_before_last_whitespace(self.output, ",")

            processed_value = self.parse_value()
            if not processed_value:
                # repair trailing comma
                self.output = strip_last_occurrence(self.output, ",")
                break

        if not self.is_end_of_document() and self.char() == "]":
            self.output += "]"
            self.inc()
        else:
            # repair missing closing array bracket
            self.output = insert_before_last_whitespace(self.output, "]")

        return True

    def parse_newline_delimited_json(self):
        """Parse and repair Newline Delimited JSON (NDJSON).

        NDJSON is a sequence of JSON values separated by newlines; the values
        are joined with commas and the whole output is wrapped in an array.
        """
        # repair NDJSON
        initial = True
        processed_value = True
        while processed_value:
            if initial:
                initial = False
            else:
                # parse optional comma, insert when missing
                processed_comma = self.parse_character(",")
                if not processed_comma:
                    # repair: add missing comma
                    self.output = insert_before_last_whitespace(self.output, ",")

            processed_value = self.parse_value()

        # the loop always ends on a failed value, which leaves a trailing comma
        # repair: remove trailing comma
        self.output = strip_last_occurrence(self.output, ",")

        # repair: wrap the output inside array brackets
        self.output = f"[\n{self.output}\n]"

    # ------------------------------------------------------------------
    # strings
    # ------------------------------------------------------------------

    def parse_string(self) -> bool:
        """Parse a string enclosed by double quotes "...".

        The string can contain escaped quotes. Repairs strings enclosed in
        single or special quotes, escaped strings (a stray backslash in front
        of the quotes), unescaped control characters, invalid escape
        sequences and a missing end quote.

        Raises:
            JsonRepairError: on an invalid ``\\u`` escape or a character that
                is not allowed inside a string.
        """
        if self.is_end_of_document():
            return False

        skip_escape_chars = self.char() == "\\"
        if skip_escape_chars:
            # repair: remove the first escape character
            self.inc()

        if self.is_end_of_document() or not is_quote(self.char()):
            return False

        # a string opened with a single quote variant ends at a single quote
        # variant; likewise for double quotes
        is_end_quote = (
            is_single_quote if is_single_quote(self.char()) else is_double_quote
        )
        # whatever the opening quote was, the output gets a normal double quote
        self.output += '"'
        self.inc()

        while not self.is_end_of_document() and not is_end_quote(self.char()):
            if self.char() == "\\":
                if self.is_end_of_document(pos=+1):
                    # repair: a backslash at the very end of the input has
                    # nothing to escape, drop it
                    self.inc()
                    break

                char = self.char(pos=+1)
                if char in ESCAPE_CHARACTERS:
                    self.output += self.text[self.i : self.i + 2]
                    self.inc(by=2)
                elif char == "u":
                    if (
                        not self.is_end_of_document(pos=+5)
                        and is_hex(self.char(pos=+2))
                        and is_hex(self.char(pos=+3))
                        and is_hex(self.char(pos=+4))
                        and is_hex(self.char(pos=+5))
                    ):
                        self.output += self.text[self.i : self.i + 6]
                        self.inc(by=6)
                    else:
                        raise self.invalid_unicode_character(self.i)
                else:
                    # repair invalid escape character: remove it
                    self.output += char
                    self.inc(by=2)
            else:
                char = self.char()
                if char == '"' and self.char(pos=-1) != "\\":
                    # repair unescaped double quote
                    self.output += "\\" + char
                    self.inc()
                elif is_control_character(char):
                    # unescaped control character
                    self.output += CONTROL_CHARACTERS[char]
                    self.inc()
                else:
                    if not is_valid_string_character(char):
                        raise self.invalid_character(char)
                    self.output += char
                    self.inc()

            if skip_escape_chars:
                # repair: skip the escape character in front of the end quote
                self.skip_escape_character()

        if not self.is_end_of_document() and is_quote(self.char()):
            # the end quote is normalized to a double quote as well
            self.output += '"'
            self.inc()
        else:
            # repair missing end quote
            self.output += '"'

        self.parse_concatenated_string()
        return True

    def parse_concatenated_string(self) -> bool:
        """Repair concatenated strings like "hello" + "world" into "helloworld"."""
        processed = False

        self.parse_whitespace_and_skip_comments()
        while not self.is_end_of_document() and self.char() == "+":
            processed = True
            self.inc()
            self.parse_whitespace_and_skip_comments()

            # repair: remove the end quote of the first string
            self.output = strip_last_occurrence(self.output, '"', True)
            start = len(self.output)
            if self.parse_string():
                # repair: remove the start quote of the second string
                self.output = remove_at_index(self.output, start, 1)
            else:
                # the "+" was not followed by a string: put back the end
                # quote that was just removed, so the string stays closed
                self.output += '"'

        return processed

    # ------------------------------------------------------------------
    # numbers, keywords and symbols
    # ------------------------------------------------------------------

    def parse_number(self) -> bool:
        """Parse a number like 2.4 or 2.4e6.

        Raises:
            JsonRepairError: when a minus sign, decimal point or exponent is
                not followed by a digit.
        """
        if self.is_end_of_document():
            return False

        start = self.i
        if self.char() == "-":
            self.inc()
            err = self.expect_digit(start)
            if err:
                raise err

        if not self.is_end_of_document() and self.char() == "0":
            self.inc()
        elif not self.is_end_of_document() and self.char() in "123456789":
            self.inc()
            while not self.is_end_of_document() and self._is_digit(self.char()):
                self.inc()

        if not self.is_end_of_document() and self.char() == ".":
            self.inc()
            err = self.expect_digit(start)
            if err:
                raise err
            while not self.is_end_of_document() and self._is_digit(self.char()):
                self.inc()

        # An exponent only counts when something was parsed before it;
        # otherwise a symbol that merely starts with "e"/"E" would be
        # rejected as a broken number instead of parsed as an unquoted string.
        if (
            self.i > start
            and not self.is_end_of_document()
            and self.char() in "eE"
        ):
            self.inc()
            if not self.is_end_of_document() and self.char() in "+-":
                self.inc()
            err = self.expect_digit(start)
            if err:
                raise err
            while not self.is_end_of_document() and self._is_digit(self.char()):
                self.inc()

        if self.i > start:
            self.output += self.text[start : self.i]
            return True

        return False

    def parse_keywords(self) -> bool:
        """Parse keywords true, false, null.

        Repair Python keywords True, False, None.
        """
        return (
            self.parse_keyword("true", "true")
            or self.parse_keyword("false", "false")
            or self.parse_keyword("null", "null")
            # repair Python keywords True, False, None
            or self.parse_keyword("True", "true")
            or self.parse_keyword("False", "false")
            or self.parse_keyword("None", "null")
        )

    def parse_keyword(self, name: str, value: str) -> bool:
        """Write ``value`` when the text at the cursor starts with ``name``."""
        if self.text[self.i : self.i + len(name)] == name:
            self.output += value
            self.inc(by=len(name))
            return True
        return False

    def parse_unquoted_string(self) -> bool:
        """Repair an unquoted string by adding quotes around it.

        Repair a MongoDB function call like NumberLong("2").
        Repair a JSONP function call like callback({...});
        """
        # note that the symbol can end with whitespaces: we stop at the next delimiter
        start = self.i
        while not self.is_end_of_document() and not is_delimiter(self.char()):
            self.inc()

        if self.i <= start:
            return False

        if not self.is_end_of_document() and self.char() == "(":
            # repair a MongoDB function call like NumberLong("2")
            # repair a JSONP function call like callback({...});
            # only the argument is kept; the function name is dropped
            self.inc()
            self.parse_value()
            if not self.is_end_of_document() and self.char() == ")":
                # repair: skip close bracket of function call
                self.inc()
                if not self.is_end_of_document() and self.char() == ";":
                    # repair: skip semicolon after JSONP call
                    self.inc()
            return True

        # else repair unquoted string
        # first, go back to prevent getting trailing whitespaces in the string
        while not self.is_start_of_document() and is_whitespace(self.char(pos=-1)):
            self.dec()

        symbol = self.text[start : self.i]
        self.output += json.dumps(symbol)
        return True

    # ------------------------------------------------------------------
    # error construction
    # ------------------------------------------------------------------

    def expect_digit(self, start: int) -> "Optional[JsonRepairError]":
        """Return an error when the current character is not a digit.

        ``start`` is the index where the number began; it is only used to
        quote the number parsed so far in the message. Returns ``None`` when
        a digit is present. The error carries the cursor position.
        """
        if self.is_end_of_document() or not self._is_digit(self.char()):
            num_so_far = self.text[start : self.i]
            return JsonRepairError(
                f"Invalid number '{num_so_far}', expecting a digit {self.got()}",
                self.i,
            )
        return None

    def invalid_character(self, char: str) -> "JsonRepairError":
        """Build the error for a character that is not allowed in a string."""
        return JsonRepairError("Invalid character " + json.dumps(char), self.i)

    def unexpected_character(self) -> "JsonRepairError":
        """Build the error for an unexpected character at the cursor."""
        return JsonRepairError(
            "Unexpected character " + json.dumps(self.text[self.i]), self.i
        )

    def unexpected_end(self) -> "JsonRepairError":
        """Build the error for a document that ends before a value was found."""
        return JsonRepairError("Unexpected end of json string", len(self.text))

    def object_key_expected(self) -> "JsonRepairError":
        """Build the error for a missing object key."""
        return JsonRepairError("Object key expected", self.i)

    def object_value_expected(self) -> "JsonRepairError":
        """Build the error for a missing object value."""
        return JsonRepairError("Object value expected", self.i)

    def colon_expected(self) -> "JsonRepairError":
        """Build the error for a missing colon between key and value."""
        return JsonRepairError("Colon expected", self.i)

    def invalid_unicode_character(self, start: int) -> "JsonRepairError":
        """Build the error for a malformed ``\\u`` escape starting at ``start``.

        The message quotes the backslash, the ``u`` and the word characters
        that follow them.
        """
        end = start + 2
        # the bounds check matters when the escape is cut off by the end of
        # the input
        while end < len(self.text) and re.match(r"\w", self.text[end]):
            end += 1
        chars = self.text[start:end]
        return JsonRepairError(f'Invalid unicode character "{chars}"', self.i)

    def got(self) -> str:
        """Describe what is at the cursor, for use in error messages."""
        if self.is_end_of_document():
            return "but reached end of input"
        return f"but got '{self.char()}'"
|