121 lines
		
	
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			121 lines
		
	
	
	
		
			3.1 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
from parser import Parser
 | 
						|
 | 
						|
# As an exercise to stress-test my understanding of recursive descent parsers,
 | 
						|
# I'm attempting to write a JSON parser without referencing any existing BNF
 | 
						|
# descriptions of JSON or existing JSON parser implementations.
 | 
						|
#
 | 
						|
# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF
 | 
						|
# that I wrote to describe my expected input:
 | 
						|
#
 | 
						|
# expression -> object
 | 
						|
# object     -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}'
 | 
						|
#            |  array
 | 
						|
# array      -> '[' expression ( ',' expression )* ']'
 | 
						|
#            |  literal
 | 
						|
# literal    -> STRING | INT
 | 
						|
 | 
						|
def tokenize(xs):
 | 
						|
    """
 | 
						|
    Return a list of tokens from the string input, `xs`.
 | 
						|
    """
 | 
						|
    result = []
 | 
						|
    i = 0
 | 
						|
    while i < len(xs):
 | 
						|
        # single characters
 | 
						|
        if xs[i] in ",{}:[]":
 | 
						|
            result.append(xs[i])
 | 
						|
            i += 1
 | 
						|
        # strings
 | 
						|
        elif xs[i] == "\"":
 | 
						|
            curr = xs[i]
 | 
						|
            i += 1
 | 
						|
            while xs[i] != "\"":
 | 
						|
                curr += xs[i]
 | 
						|
                i += 1
 | 
						|
            curr += xs[i]
 | 
						|
            result.append(curr)
 | 
						|
            i += 1
 | 
						|
        # integers
 | 
						|
        elif xs[i] in "0123456789":
 | 
						|
            curr = xs[i]
 | 
						|
            i += 1
 | 
						|
            while xs[i] in "0123456789":
 | 
						|
                curr += xs[i]
 | 
						|
                i += 1
 | 
						|
            result.append(int(curr))
 | 
						|
        # whitespace
 | 
						|
        elif xs[i] in {" ", "\n"}:
 | 
						|
            i += 1
 | 
						|
    return result
 | 
						|
 | 
						|
def parse_json(x):
 | 
						|
    """
 | 
						|
    Attempt to parse the string, `x`, into JSON.
 | 
						|
    """
 | 
						|
    tokens = tokenize(x)
 | 
						|
    return parse_object(Parser(tokens))
 | 
						|
 | 
						|
def parse_object(parser):
 | 
						|
    if parser.match(['{']):
 | 
						|
        key = parse_string(parser)
 | 
						|
        parser.expect([':'])
 | 
						|
        value = parse_object(parser)
 | 
						|
        result = [(key, value)]
 | 
						|
        while parser.match([',']):
 | 
						|
            key = parse_string(parser)
 | 
						|
            parser.match([':'])
 | 
						|
            value = parse_object(parser)
 | 
						|
            result.append((key, value))
 | 
						|
        return result
 | 
						|
    return parse_array(parser)
 | 
						|
 | 
						|
def parse_array(parser):
 | 
						|
    if parser.match(['[']):
 | 
						|
        if parser.match([']']):
 | 
						|
            return []
 | 
						|
        result = [parse_object(parser)]
 | 
						|
        while parser.match([',']):
 | 
						|
            result.append(parse_object(parser))
 | 
						|
        parser.expect([']'])
 | 
						|
        return result
 | 
						|
    else:
 | 
						|
        return parse_literal(parser)
 | 
						|
 | 
						|
def parse_string(parser):
 | 
						|
    if parser.curr().startswith("\""):
 | 
						|
        return parser.consume()
 | 
						|
    else:
 | 
						|
        raise Exception("Unexpected token: {}".format(parser.curr()))
 | 
						|
 | 
						|
def parse_literal(parser):
 | 
						|
    return parser.consume()
 | 
						|
 | 
						|
sample = """
 | 
						|
{
 | 
						|
  "glossary": {
 | 
						|
    "title": "example glossary",
 | 
						|
    "GlossDiv": {
 | 
						|
      "title": "S",
 | 
						|
      "GlossList": {
 | 
						|
        "GlossEntry": {
 | 
						|
          "ID": "SGML",
 | 
						|
          "SortAs": "SGML",
 | 
						|
          "GlossTerm": "Standard Generalized Markup Language",
 | 
						|
          "Acronym": "SGML",
 | 
						|
          "Abbrev": "ISO 8879:1986",
 | 
						|
          "GlossDef": {
 | 
						|
            "para": "A meta-markup language, used to create markup languages such as DocBook.",
 | 
						|
            "GlossSeeAlso": [
 | 
						|
              "GML",
 | 
						|
              "XML"
 | 
						|
            ]
 | 
						|
          },
 | 
						|
          "GlossSee": "markup"
 | 
						|
        }
 | 
						|
      }
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
"""
 | 
						|
 | 
						|
print(parse_json(sample))
 |