subtree(users/wpcarro): docking briefcase at '24f5a642'
git-subtree-dir: users/wpcarro git-subtree-mainline:464bbcb15cgit-subtree-split:24f5a642afChange-Id: I6105b3762b79126b3488359c95978cadb3efa789
This commit is contained in:
commit
019f8fd211
766 changed files with 175420 additions and 0 deletions
121
users/wpcarro/scratch/facebook/parsing/json.py
Normal file
121
users/wpcarro/scratch/facebook/parsing/json.py
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
from parser import Parser
|
||||
|
||||
# As an exercise to stress-test my understanding of recursive descent parsers,
|
||||
# I'm attempting to write a JSON parser without referencing any existing BNF
|
||||
# descriptions of JSON or existing JSON parser implementations.
|
||||
#
|
||||
# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF
|
||||
# that I wrote to describe my expected input:
|
||||
#
|
||||
# expression -> object
|
||||
# object -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}'
|
||||
# | array
|
||||
# array -> '[' expression ( ',' expression )* ']'
|
||||
# | literal
|
||||
# literal -> STRING | INT
|
||||
|
||||
def tokenize(xs):
    """
    Return a list of tokens from the string input, `xs`.

    Tokens are: single punctuation characters (",{}:[]"), quoted strings
    (returned with their surrounding double quotes), and non-negative
    integers (returned as `int`).  Spaces and newlines are skipped.

    Raises an Exception on an unterminated string or an unrecognized
    character.  (The original looped forever on unknown characters and
    raised IndexError on input ending inside a string or number.)
    """
    result = []
    i = 0
    n = len(xs)
    while i < n:
        c = xs[i]
        # single characters
        if c in ",{}:[]":
            result.append(c)
            i += 1
        # strings (kept with their quotes so the parser can recognize them)
        elif c == "\"":
            curr = c
            i += 1
            while i < n and xs[i] != "\"":
                curr += xs[i]
                i += 1
            if i == n:
                raise Exception("Unterminated string literal: {}".format(curr))
            curr += xs[i]  # closing quote
            result.append(curr)
            i += 1
        # integers
        elif c in "0123456789":
            curr = c
            i += 1
            while i < n and xs[i] in "0123456789":
                curr += xs[i]
                i += 1
            result.append(int(curr))
        # whitespace
        elif c in {" ", "\n"}:
            i += 1
        else:
            # Previously this fell through without advancing `i`,
            # looping forever.
            raise Exception("Unexpected character: {}".format(c))
    return result
|
||||
|
||||
def parse_json(x):
    """
    Attempt to parse the string, `x`, into JSON.

    Objects come back as lists of (key, value) pairs; arrays as lists;
    strings keep their surrounding quotes.
    """
    return parse_object(Parser(tokenize(x)))
|
||||
|
||||
def parse_object(parser):
    """
    Parse an object per the grammar:
      object -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}'
              | array

    Objects are returned as lists of (key, value) pairs.  Falls through to
    parse_array when the input does not begin with '{'.
    """
    if parser.match(['{']):
        key = parse_string(parser)
        parser.expect([':'])
        value = parse_object(parser)
        result = [(key, value)]
        while parser.match([',']):
            key = parse_string(parser)
            # Fix: was `parser.match([':'])`, which silently accepted a
            # missing ':'.  Use expect, matching the first clause above.
            parser.expect([':'])
            value = parse_object(parser)
            result.append((key, value))
        # Fix: the closing '}' was never consumed, so a nested object
        # followed by ',' caused the parent loop to stop early and drop
        # all remaining keys (e.g. "GlossSee" in the sample document).
        parser.expect(['}'])
        return result
    return parse_array(parser)
|
||||
|
||||
def parse_array(parser):
    """
    Parse an array per the grammar:
      array -> '[' expression ( ',' expression )* ']'
             | literal

    Falls through to parse_literal when the input does not begin with '['.
    """
    if not parser.match(['[']):
        return parse_literal(parser)
    if parser.match([']']):
        return []
    elements = [parse_object(parser)]
    while parser.match([',']):
        elements.append(parse_object(parser))
    parser.expect([']'])
    return elements
|
||||
|
||||
def parse_string(parser):
    """
    Consume and return the current token when it is a string literal
    (i.e. begins with a double quote); otherwise raise.
    """
    token = parser.curr()
    if not token.startswith("\""):
        raise Exception("Unexpected token: {}".format(token))
    return parser.consume()
|
||||
|
||||
def parse_literal(parser):
    """Consume and return the current token (a string or integer) as-is."""
    return parser.consume()
|
||||
|
||||
# A nested JSON document (the well-known "glossary" example) used to
# smoke-test the parser above.  Exercises objects, nested objects, arrays,
# and string literals; tokenize() skips all whitespace, so the layout of
# the literal is cosmetic.
sample = """
{
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": [
"GML",
"XML"
]
},
"GlossSee": "markup"
}
}
}
}
}
"""

# Parse the sample document and print the resulting list-of-pairs structure.
print(parse_json(sample))
|
||||
28
users/wpcarro/scratch/facebook/parsing/parser.py
Normal file
28
users/wpcarro/scratch/facebook/parsing/parser.py
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
class Parser(object):
    """
    A cursor over a list of tokens with single-token lookahead.

    `tokens` may hold any hashable values; `i` is the index of the next
    unconsumed token.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.i = 0

    def prev(self):
        """Return the most recently consumed token."""
        return self.tokens[self.i - 1]

    def curr(self):
        """Return the current token without consuming it."""
        return self.tokens[self.i]

    def consume(self):
        """Consume and return the current token; None when exhausted."""
        if not self.exhausted():
            self.i += 1
            return self.prev()

    def match(self, xs):
        """If the current token is a member of `xs`, consume it and return
        True; otherwise leave the cursor alone and return False."""
        if not self.exhausted() and self.curr() in xs:
            self.consume()
            return True
        return False

    def expect(self, xs):
        """Like match, but raise when the current token is not in `xs`."""
        if not self.match(xs):
            # Fix: calling self.curr() on an exhausted stream raised
            # IndexError instead of the intended error message.
            actual = "<EOF>" if self.exhausted() else self.curr()
            raise Exception("Expected token \"{}\" but received \"{}\"".format(xs, actual))

    def exhausted(self):
        """Return True when every token has been consumed."""
        return self.i >= len(self.tokens)
|
||||
184
users/wpcarro/scratch/facebook/parsing/regex.py
Normal file
184
users/wpcarro/scratch/facebook/parsing/regex.py
Normal file
|
|
@ -0,0 +1,184 @@
|
|||
# Writing a small proof-of-concept...
|
||||
# - lexer
|
||||
# - parser
|
||||
# - compiler
|
||||
# ...for regex.
|
||||
#
|
||||
# BNF
|
||||
# expression -> ( char_class | CHAR ) quantifier? ( "|" expression )*
|
||||
# char_class -> "[" CHAR+ "]"
|
||||
# quantifier -> "?" | "*" | "+" | "{" INT? "," INT? "}"
|
||||
#
|
||||
# Of the numerous things I do not support, here are a few items of which I'm
|
||||
# aware:
|
||||
# - alternatives: (a|b)
|
||||
# - capture groups: (ab)cd
|
||||
|
||||
from parser import Parser
|
||||
import string
|
||||
|
||||
################################################################################
|
||||
# Top-Level API
|
||||
################################################################################
|
||||
|
||||
def tokenize(xs):
    """
    Transform `xs` into a list of tokens.

    Also: expand shorthand symbols using the following table:
    - ? -> {0,1}
    - * -> {0,}
    - + -> {1,}

    Quantifier bounds are emitted as `int` tokens; an absent bound (as in
    "{2,}" or "{,3}") emits no token at all, which parse_quantifier
    already handles.  (The original called int("") for absent bounds and
    indexed past the end of the input for a trailing quantifier.)
    """
    result = []
    i = 0
    n = len(xs)
    shorthand = {
        "?": ["{", 0, ",", 1, "}"],
        "*": ["{", 0, ",", "}"],
        "+": ["{", 1, ",", "}"],
    }
    while i < n:
        if xs[i] in shorthand:
            result.extend(shorthand[xs[i]])
            i += 1
        elif xs[i] == "{":
            result.append(xs[i])
            i += 1
            # lower bound (may be absent, e.g. "{,3}")
            curr = ""
            while i < n and xs[i] in string.digits:
                curr += xs[i]
                i += 1
            if curr:
                result.append(int(curr))
            assert i < n and xs[i] == ","
            result.append(",")
            i += 1
            # upper bound (may be absent, e.g. "{2,}"); the closing "}"
            # is deliberately left for the generic branch below.
            curr = ""
            while i < n and xs[i] in string.digits:
                curr += xs[i]
                i += 1
            if curr:
                result.append(int(curr))
        else:
            result.append(xs[i])
            i += 1
    return result
|
||||
|
||||
def parse(expr):
    """
    Tokenize `expr` and convert it into a parse-tree.
    """
    return parse_tokens(tokenize(expr))
|
||||
|
||||
def compile(xs):
    """
    Transform `xs`, a parse-tree representing a regex, into a function that
    accepts a string and returns the prefix that the regex matches, or
    None when the match fails.

    Each node of `xs` has the shape
      ["CHARACTER_CLASS", <set of chars>, ["QUANTIFIER", lo, hi]]
    where `hi` may be float('inf') for an unbounded repetition.

    NOTE: shadows the builtin `compile`; kept for interface compatibility.
    """
    def fn(s):
        match = ""
        i = 0
        for node in xs:
            matches, q = node[1], node[2]
            lo, hi = q[1], q[2]
            # Mandatory portion: the first `lo` repetitions must succeed.
            for _ in range(lo):
                if i < len(s) and s[i] in matches:
                    match += s[i]
                    i += 1
                else:
                    # Fix: the original printed s[i] here even when the
                    # input was exhausted (i == len(s)) -> IndexError.
                    found = s[i] if i < len(s) else "<end-of-input>"
                    print("Failed to match {} with {}".format(found, matches))
                    return None
            # Optional portion: greedily take up to `hi - lo` more.
            if hi == float('inf'):
                while i < len(s) and s[i] in matches:
                    match += s[i]
                    i += 1
            else:
                for _ in range(hi - lo):
                    if i < len(s) and s[i] in matches:
                        match += s[i]
                        i += 1
        return match
    return fn
|
||||
|
||||
################################################################################
|
||||
# Helper Functions
|
||||
################################################################################
|
||||
|
||||
def parse_tokens(tokens):
    """Consume every token, returning the list of parsed expressions."""
    parser = Parser(tokens)
    expressions = []
    while not parser.exhausted():
        expressions.append(parse_expression(parser))
    return expressions
|
||||
|
||||
def parse_expression(parser):
    """
    Parse one expression per the grammar:
      expression -> ( char_class | CHAR ) quantifier?
    """
    if parser.curr() != "[":
        return parse_character(parser)
    return parse_character_class(parser)
|
||||
|
||||
def parse_character_class(parser):
    """
    Parse a character-class range like "[a-c]" with an optional trailing
    quantifier, returning a CHARACTER_CLASS node.
    """
    parser.expect("[")
    beg = parser.consume()
    parser.expect("-")
    end = parser.consume()
    parser.expect("]")
    # Fix: `q` was unbound (NameError) when no quantifier followed, and
    # curr() raised IndexError when "]" was the final token.
    q = None
    if not parser.exhausted() and parser.curr() == "{":
        q = parse_quantifier(parser)
    return char_class(xs=expand_range(beg, end), q=q)
|
||||
|
||||
def parse_quantifier(parser):
    """
    Parse a "{lo,hi}" quantifier, where either bound may be absent:
      {,n} -> beg 0,  end n
      {m,} -> beg m,  end inf
      {m,n} -> beg m, end n
    """
    parser.expect("{")
    # "{,n}": no lower bound, so the comma comes first.
    if parser.match([","]):
        end = parser.consume()
        parser.expect("}")
        return quantifier(beg=0, end=end)
    beg = parser.consume()
    parser.expect(",")
    # "{m,}": no upper bound.
    if parser.match(["}"]):
        return quantifier(beg=beg)
    end = parser.consume()
    parser.expect("}")
    return quantifier(beg=beg, end=end)
|
||||
|
||||
def parse_character(parser):
    """
    Parse a single literal character with an optional quantifier,
    returning a CHARACTER_CLASS node whose set holds just that character.
    """
    c = parser.consume()
    q = None
    # Fix: guard against reading past the end of the token stream when the
    # character is the final token (curr() raised IndexError).
    if not parser.exhausted() and parser.curr() == "{":
        q = parse_quantifier(parser)
    return char_class(xs={c}, q=q)
|
||||
|
||||
def char_class(xs=None, q=None):
    """
    Build a CHARACTER_CLASS node from a set of characters `xs` and an
    optional QUANTIFIER node `q` (defaults to exactly-one: {1,1}).
    """
    # Fix: `xs=set()` was a mutable default argument shared across calls.
    if xs is None:
        xs = set()
    if not q:
        q = quantifier(beg=1, end=1)
    return ["CHARACTER_CLASS", xs, q]
|
||||
|
||||
def expand_range(beg, end):
    """
    Return the set of characters from `beg` to `end` inclusive, using
    their positions in string.printable as the ordering.
    """
    # TODO: Implement this
    lo = string.printable.index(beg)
    hi = string.printable.index(end)
    return set(string.printable[lo:hi + 1])
|
||||
|
||||
def quantifier(beg=0, end=float('inf')):
    """Build a QUANTIFIER node: match from `beg` to `end` repetitions."""
    return ['QUANTIFIER', beg, end]
|
||||
|
||||
################################################################################
|
||||
# Tests
|
||||
################################################################################
|
||||
|
||||
# Smoke tests: (regex, inputs-to-match) pairs.  "dog" is expected to fail
# against "ca+t?"; against "[a-c]*[0-9]{2,3}" it fails on the digit part.
xs = [
    ("[a-c]*[0-9]{2,3}", ["dog"]),
    ("ca+t?", ["cat", "caaaat", "ca", "dog"]),
]

for re, inputs in xs:
    print("Regex: {}".format(re))
    print("Tokens: {}".format(tokenize(re)))
    print("Parsed: {}".format(parse(re)))
    print("\nTESTS")
    # Fix: compile once per regex — the matcher is invariant across inputs,
    # so there is no reason to re-tokenize/parse/compile inside the loop.
    matcher = compile(parse(re))
    for input in inputs:
        print("Attempting to match \"{}\"...".format(input))
        print("Result: \"{}\"\n".format(matcher(input)))
|
||||
Loading…
Add table
Add a link
Reference in a new issue