subtree(users/wpcarro): docking briefcase at '24f5a642'
				
					
				
			git-subtree-dir: users/wpcarro
			git-subtree-mainline: 464bbcb15c
			git-subtree-split: 24f5a642af
			Change-Id: I6105b3762b79126b3488359c95978cadb3efa789
This commit is contained in:
		
						commit
						019f8fd211
					
				
					 766 changed files with 175420 additions and 0 deletions
				
			
		
							
								
								
									
										184
									
								
								users/wpcarro/scratch/facebook/parsing/regex.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										184
									
								
								users/wpcarro/scratch/facebook/parsing/regex.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,184 @@ | |||
| # Writing a small proof-of-concept... | ||||
| #   - lexer | ||||
| #   - parser | ||||
| #   - compiler | ||||
| # ...for regex. | ||||
| # | ||||
| # BNF | ||||
| # expression -> ( char_class | CHAR ) quantifier? ( "|" expression )* | ||||
| # char_class -> "[" CHAR+ "]" | ||||
| # quantifier -> "?" | "*" | "+" | "{" INT? "," INT? "}" | ||||
| # | ||||
| # Of the numerous things I do not support, here are a few items of which I'm | ||||
| # aware: | ||||
| #   - alternatives:   (a|b) | ||||
| #   - capture groups: (ab)cd | ||||
| 
 | ||||
| from parser import Parser | ||||
| import string | ||||
| 
 | ||||
| ################################################################################ | ||||
| # Top-Level API | ||||
| ################################################################################ | ||||
| 
 | ||||
def tokenize(xs):
    """
    Transform `xs` into a list of tokens.

    Every character becomes its own token, with two exceptions:
      - shorthand quantifiers are expanded using the following table:
          - ? -> {0,1}
          - * -> {0,}
          - + -> {1,}
      - the bounds inside "{lo,hi}" are emitted as ints. Either bound may be
        omitted ("{,3}", "{2,}"), in which case no token is emitted for it,
        which mirrors the shape of the shorthand expansions above.

    Raises ValueError when a "{" is not followed by an optional integer and
    then a ",".
    """
    shorthand = {
        "?": ["{", 0, ",", 1, "}"],
        "*": ["{", 0, ",", "}"],
        "+": ["{", 1, ",", "}"],
    }
    result = []
    n = len(xs)

    def read_bound(start):
        """Append the int beginning at `start`, if any; return the next index."""
        stop = start
        # Bounded by `n` so that an unterminated quantifier cannot run off the
        # end of the input.
        while stop < n and xs[stop] in string.digits:
            stop += 1
        # An omitted bound yields no token rather than int("").
        if stop > start:
            result.append(int(xs[start:stop]))
        return stop

    i = 0
    while i < n:
        c = xs[i]
        if c in shorthand:
            result.extend(shorthand[c])
            i += 1
        elif c == "{":
            result.append(c)
            i = read_bound(i + 1)
            if i >= n or xs[i] != ",":
                raise ValueError(
                    "Expected ',' in quantifier at position {}".format(i))
            result.append(",")
            # The closing "}" is not consumed here; it is emitted by the
            # catch-all branch on the next iteration.
            i = read_bound(i + 1)
        else:
            result.append(c)
            i += 1
    return result
| 
 | ||||
def parse(expr):
    """
    Convert the regex source `expr` into a parse-tree by tokenizing it and
    handing the tokens to `parse_tokens`.
    """
    return parse_tokens(tokenize(expr))
| 
 | ||||
def compile(xs):
    """
    Transform `xs`, a parse-tree representing a regex, into a function that
    accepts a string, and returns the substring that the regex matches.

    `xs` is a list of nodes shaped like
    ["CHARACTER_CLASS", chars, ["QUANTIFIER", lo, hi]].

    The returned function matches from the start of its input. Each node
    consumes as many characters as it is allowed to (greedy) and nothing is
    ever given back (no backtracking). It returns None, after printing a
    diagnostic, when a node cannot be matched its minimum `lo` times.
    """
    def fn(input):
        i = 0
        for x in xs:
            matches, q = x[1], x[2]
            lo, hi = q[1], q[2]
            # Mandatory repetitions: every one of them has to match.
            for _ in range(lo):
                if i < len(input) and input[i] in matches:
                    i += 1
                else:
                    # Running out of input is a failed match too; do not index
                    # past the end just to build the message.
                    found = input[i] if i < len(input) else "end of input"
                    print("Failed to match {} with {}".format(found, matches))
                    return None
            # Optional repetitions: take as many as are available.
            if hi == float('inf'):
                while i < len(input) and input[i] in matches:
                    i += 1
            else:
                for _ in range(hi - lo):
                    if i < len(input) and input[i] in matches:
                        i += 1
                    else:
                        # Once one optional repetition fails, the rest would
                        # fail on the same character.
                        break
        # Characters are only ever consumed contiguously from index 0, so the
        # match is exactly the consumed prefix.
        return "".join(input[:i])
    return fn
| 
 | ||||
| ################################################################################ | ||||
| # Helper Functions | ||||
| ################################################################################ | ||||
| 
 | ||||
def parse_tokens(tokens):
    """
    Wrap `tokens` in a `Parser` and collect one expression node after another
    until the parser reports that it is exhausted.
    """
    parser = Parser(tokens)
    nodes = []
    while True:
        if parser.exhausted():
            break
        nodes.append(parse_expression(parser))
    return nodes
| 
 | ||||
def parse_expression(parser):
    """
    Parse a single expression: a character class when the current token is
    "[", otherwise a lone character.
    """
    handler = parse_character_class if parser.curr() == "[" else parse_character
    return handler(parser)
| 
 | ||||
def parse_character_class(parser):
    """
    Parse a range-style character class, "[" beg "-" end "]", together with
    the quantifier that may follow it.

    Returns a CHARACTER_CLASS node whose character set is the expansion of
    `beg`..`end`. When no quantifier follows, `char_class` supplies its
    default.
    """
    parser.expect("[")
    beg = parser.consume()
    parser.expect("-")
    end = parser.consume()
    parser.expect("]")
    # Must be initialised: a class without a trailing quantifier (e.g. "[a-c]")
    # would otherwise hit an unbound `q` below.
    q = None
    if parser.curr() == "{":
        q = parse_quantifier(parser)
    return char_class(xs=expand_range(beg, end), q=q)
| 
 | ||||
def parse_quantifier(parser):
    """
    Parse "{" INT? "," INT? "}" into a QUANTIFIER node.

    A missing lower bound becomes 0; a missing upper bound is left to
    `quantifier`'s default.
    """
    parser.expect("{")
    # NOTE(review): presumably `match` consumes the token when it matches;
    # confirm against the Parser class.
    if parser.match([","]):
        lo = 0
    else:
        lo = parser.consume()
        parser.expect(",")
        if parser.match(["}"]):
            return quantifier(beg=lo)
    hi = parser.consume()
    parser.expect("}")
    return quantifier(beg=lo, end=hi)
| 
 | ||||
def parse_character(parser):
    """
    Parse one literal character, plus the quantifier that may follow it, into
    a single-member CHARACTER_CLASS node.
    """
    literal = parser.consume()
    repeat = parse_quantifier(parser) if parser.curr() == "{" else None
    return char_class(xs={literal}, q=repeat)
| 
 | ||||
def char_class(xs=None, q=None):
    """
    Build a CHARACTER_CLASS node: ["CHARACTER_CLASS", xs, q].

    `xs` is the set of characters the class accepts; it defaults to a fresh
    empty set. `q` is a QUANTIFIER node; when it is missing the class must
    match exactly once.
    """
    # A `set()` default would be created once and shared by every node built
    # without `xs`, so mutating one node's set would leak into the others.
    if xs is None:
        xs = set()
    if not q:
        q = quantifier(beg=1, end=1)
    return ["CHARACTER_CLASS", xs, q]
| 
 | ||||
def expand_range(beg, end):
    """
    Return the set of characters lying between `beg` and `end` (inclusive) in
    `string.printable` order. A reversed range yields an empty set.
    """
    # TODO: Implement this properly; for now ranges follow the ordering of
    # string.printable.
    first = string.printable.index(beg)
    last = string.printable.index(end)
    return set(string.printable[first:last + 1])
| 
 | ||||
def quantifier(beg=0, end=float('inf')):
    """
    Build a QUANTIFIER node: ['QUANTIFIER', beg, end]. With no arguments the
    node stands for "zero or more".
    """
    node = ['QUANTIFIER']
    node.extend((beg, end))
    return node
| 
 | ||||
| ################################################################################ | ||||
| # Tests | ||||
| ################################################################################ | ||||
| 
 | ||||
# Each entry pairs a regex with the inputs to run it against.
xs = [
    ("[a-c]*[0-9]{2,3}", ["dog"]),
    ("ca+t?", ["cat", "caaaat", "ca", "dog"]),
]

# Show every pipeline stage for each regex, then the result for each input.
for pattern, candidates in xs:
    print("Regex:  {}".format(pattern))
    print("Tokens: {}".format(tokenize(pattern)))
    print("Parsed: {}".format(parse(pattern)))
    print("\nTESTS")
    for candidate in candidates:
        print('Attempting to match "{}"...'.format(candidate))
        matcher = compile(parse(pattern))
        outcome = matcher(candidate)
        print('Result: "{}"\n'.format(outcome))
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue