subtree(users/wpcarro): docking briefcase at '24f5a642'
git-subtree-dir: users/wpcarro git-subtree-mainline:464bbcb15cgit-subtree-split:24f5a642afChange-Id: I6105b3762b79126b3488359c95978cadb3efa789
This commit is contained in:
commit
019f8fd211
766 changed files with 175420 additions and 0 deletions
121
users/wpcarro/scratch/facebook/parsing/json.py
Normal file
121
users/wpcarro/scratch/facebook/parsing/json.py
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
from parser import Parser
|
||||
|
||||
# As an exercise to stress-test my understanding of recursive descent parsers,
|
||||
# I'm attempting to write a JSON parser without referencing any existing BNF
|
||||
# descriptions of JSON or existing JSON parser implementations.
|
||||
#
|
||||
# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF
|
||||
# that I wrote to describe my expected input:
|
||||
#
|
||||
# expression -> object
|
||||
# object -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}'
|
||||
# | array
|
||||
# array -> '[' expression ( ',' expression )* ']'
|
||||
# | literal
|
||||
# literal -> STRING | INT
|
||||
|
||||
def tokenize(xs):
|
||||
"""
|
||||
Return a list of tokens from the string input, `xs`.
|
||||
"""
|
||||
result = []
|
||||
i = 0
|
||||
while i < len(xs):
|
||||
# single characters
|
||||
if xs[i] in ",{}:[]":
|
||||
result.append(xs[i])
|
||||
i += 1
|
||||
# strings
|
||||
elif xs[i] == "\"":
|
||||
curr = xs[i]
|
||||
i += 1
|
||||
while xs[i] != "\"":
|
||||
curr += xs[i]
|
||||
i += 1
|
||||
curr += xs[i]
|
||||
result.append(curr)
|
||||
i += 1
|
||||
# integers
|
||||
elif xs[i] in "0123456789":
|
||||
curr = xs[i]
|
||||
i += 1
|
||||
while xs[i] in "0123456789":
|
||||
curr += xs[i]
|
||||
i += 1
|
||||
result.append(int(curr))
|
||||
# whitespace
|
||||
elif xs[i] in {" ", "\n"}:
|
||||
i += 1
|
||||
return result
|
||||
|
||||
def parse_json(x):
|
||||
"""
|
||||
Attempt to parse the string, `x`, into JSON.
|
||||
"""
|
||||
tokens = tokenize(x)
|
||||
return parse_object(Parser(tokens))
|
||||
|
||||
def parse_object(parser):
|
||||
if parser.match(['{']):
|
||||
key = parse_string(parser)
|
||||
parser.expect([':'])
|
||||
value = parse_object(parser)
|
||||
result = [(key, value)]
|
||||
while parser.match([',']):
|
||||
key = parse_string(parser)
|
||||
parser.match([':'])
|
||||
value = parse_object(parser)
|
||||
result.append((key, value))
|
||||
return result
|
||||
return parse_array(parser)
|
||||
|
||||
def parse_array(parser):
|
||||
if parser.match(['[']):
|
||||
if parser.match([']']):
|
||||
return []
|
||||
result = [parse_object(parser)]
|
||||
while parser.match([',']):
|
||||
result.append(parse_object(parser))
|
||||
parser.expect([']'])
|
||||
return result
|
||||
else:
|
||||
return parse_literal(parser)
|
||||
|
||||
def parse_string(parser):
|
||||
if parser.curr().startswith("\""):
|
||||
return parser.consume()
|
||||
else:
|
||||
raise Exception("Unexpected token: {}".format(parser.curr()))
|
||||
|
||||
def parse_literal(parser):
|
||||
return parser.consume()
|
||||
|
||||
sample = """
|
||||
{
|
||||
"glossary": {
|
||||
"title": "example glossary",
|
||||
"GlossDiv": {
|
||||
"title": "S",
|
||||
"GlossList": {
|
||||
"GlossEntry": {
|
||||
"ID": "SGML",
|
||||
"SortAs": "SGML",
|
||||
"GlossTerm": "Standard Generalized Markup Language",
|
||||
"Acronym": "SGML",
|
||||
"Abbrev": "ISO 8879:1986",
|
||||
"GlossDef": {
|
||||
"para": "A meta-markup language, used to create markup languages such as DocBook.",
|
||||
"GlossSeeAlso": [
|
||||
"GML",
|
||||
"XML"
|
||||
]
|
||||
},
|
||||
"GlossSee": "markup"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
print(parse_json(sample))
|
||||
Loading…
Add table
Add a link
Reference in a new issue