Write encoded XML parser and pretty-printer
Write a function that reads a string of compressed XML and outputs the decompressed version. Note to self: Now that I'm growing more comfortable writing parsers, I'd like to become equally comfortable writing pretty-printers.
This commit is contained in:
		
							parent
							
								
									bfd2180e6b
								
							
						
					
					
						commit
						c841527f61
					
				
					 2 changed files with 135 additions and 0 deletions
				
			
		
							
								
								
									
										98
									
								
								scratch/facebook/moderate/decompress-xml.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								scratch/facebook/moderate/decompress-xml.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,98 @@ | |||
| import string | ||||
| from parser import Parser | ||||
| 
 | ||||
# Tag ID -> element/attribute name used by the compressed encoding.
# IDs are assigned consecutively starting at 1; 0 has no entry because the
# parser treats it as the END marker.
mapping = dict(enumerate(
    ("family", "person", "firstName", "lastName", "state"),
    start=1,
))
| 
 | ||||
def parse_int(i, xs):
    """Read the run of ASCII digits that starts at ``xs[i]``.

    Returns ``(j, value)`` where ``j`` is the index of the first character
    after the digits (or ``len(xs)``) and ``value`` is the digits as an int.
    Raises ValueError (from ``int``) when ``xs[i]`` is not a digit.
    """
    start = i
    limit = len(xs)
    # Advance to the first non-digit, then convert the whole slice at once.
    while i < limit and xs[i] in string.digits:
        i += 1
    return i, int(xs[start:i])
| 
 | ||||
def parse_string(i, xs):
    """Read the text token that starts at ``xs[i]``.

    The token runs up to the next ASCII digit or the end of ``xs`` and may
    contain spaces (e.g. ``"Some Message"``). A single space that separates
    the text from what follows is not part of the token.

    Returns ``(j, text)`` where ``j`` is the index of the first character
    that was not consumed: the separating space when there is one, otherwise
    the digit or ``len(xs)``.
    """
    end = i
    # Bounds-checked scan: text that reaches the end of the input is a valid
    # token rather than an IndexError.
    while end < len(xs) and xs[end] not in string.digits:
        end += 1
    if end > i and xs[end - 1] == " ":
        # Leave the separator unconsumed; the tokenizer skips it.
        end -= 1
    return end, xs[i:end]
| 
 | ||||
def tokenize(xs):
    """Split the compressed-XML string ``xs`` into a flat list of tokens.

    Runs of digits become ints (via ``parse_int``) and text starting with an
    ASCII letter becomes a str (via ``parse_string``). Whitespace between
    tokens is skipped.

    Raises ValueError for a character that cannot start any token; without
    this the loop would never advance past it.
    """
    result = []
    i = 0
    while i < len(xs):
        ch = xs[i]
        if ch.isspace():
            i += 1
        elif ch in string.digits:
            i, n = parse_int(i, xs)
            result.append(n)
        elif ch in string.ascii_letters:
            i, x = parse_string(i, xs)
            result.append(x)
        else:
            raise ValueError(
                "Unexpected character {!r} at position {}".format(ch, i))
    return result
| 
 | ||||
def parse(xs):
    """Tokenize the compressed-XML string ``xs`` and parse its root element.

    Returns whatever ``parse_element`` returns for the first element in the
    token stream.
    """
    tokens = tokenize(xs)
    return parse_element(Parser(tokens))
| 
 | ||||
| # Element   -> Tag Attribute* End (Element | STRING)* End ; | ||||
| # Tag       -> INTEGER ; | ||||
| # Value     -> STRING ; | ||||
| # Attribute -> Tag Value ; | ||||
| # End       -> 0 ; | ||||
| 
 | ||||
def parse_element(parser):
    """Parse one element, or one piece of text content, from ``parser``.

    A str token is text content and is returned as-is. Otherwise the tokens
    must form ``Tag Attribute* 0 child* 0`` and the result is the list
    ``[tag_name, attrs, children]``, where ``attrs`` comes from
    ``parse_attrs`` and each child is itself a ``parse_element`` result.

    Raises KeyError when the tag ID is not in ``mapping``, plus whatever the
    parser's ``expect``/``expect_predicate`` raise on a mismatch.
    """
    # Text content stands for itself.
    if type(parser.curr()) == str:
        return parser.consume()

    def is_int(token):
        return type(token) == int

    tag_id = parser.expect_predicate(is_int)
    name = mapping[tag_id]
    attributes = parse_attrs(parser)
    parser.expect([0])  # END of the attribute list

    kids = []
    # Children continue until the END marker (or until the tokens run out).
    while not (parser.exhausted() or parser.curr() == 0):
        kids.append(parse_element(parser))
    parser.expect([0])  # END of the element
    return [name, attributes, kids]
| 
 | ||||
def parse_attrs(parser):
    """Parse ``Tag Value`` pairs up to, but not including, the next ``0``.

    Returns a list of ``(attribute_name, value)`` tuples in input order. The
    terminating ``0`` is left for the caller to consume.

    Raises KeyError when an attribute ID is not in ``mapping``.
    """
    pairs = []
    while not parser.curr() == 0:
        key_id = parser.expect_predicate(lambda token: type(token) == int)
        # The name is looked up before the value token is consumed.
        pairs.append((mapping[key_id], parser.consume()))
    return pairs
| 
 | ||||
def stringify_xml(tree, indent=0):
    """Pretty-print a parsed tree as XML, two spaces per nesting level.

    ``tree`` is either a str (text content) or a ``[tag, attrs, children]``
    list, where ``attrs`` is a sequence of ``(name, value)`` pairs and each
    child is itself a tree. ``indent`` is the nesting depth of ``tree``.

    Every node is responsible for its own indentation, so each line of the
    result is indented by exactly its depth regardless of how deep the tree
    is or how many children a node has. An element without children renders
    as an opening and a closing tag on consecutive lines.

    Attribute values and text are inserted verbatim (no XML escaping).
    """
    pad = " " * 2 * indent
    if isinstance(tree, str):
        return pad + tree

    tag, attrs, children = tree
    # Each attribute carries its own leading space, so no attributes means
    # nothing is added after the tag name.
    str_attrs = "".join(" {}=\"{}\"".format(k, v) for k, v in attrs)

    lines = ["{}<{}{}>".format(pad, tag, str_attrs)]
    lines.extend(stringify_xml(child, indent + 1) for child in children)
    lines.append("{}</{}>".format(pad, tag))
    return "\n".join(lines)
| 
 | ||||
# Demo run: push one sample document through every stage of the pipeline.
# Each stage is computed immediately before its line is printed.
x = "1 4 McDowell 5 CA 0 2 3 Gayle 0 Some Message 0 0"
for _label, _stage in (
    ("Input:   {}", lambda s: s),
    ("Tokens:  {}", tokenize),
    ("Parsed:  {}", parse),
    ("{}", lambda s: stringify_xml(parse(s))),
):
    print(_label.format(_stage(x)))
							
								
								
									
										37
									
								
								scratch/facebook/moderate/parser.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								scratch/facebook/moderate/parser.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,37 @@ | |||
class Parser(object):
    """Cursor over a list of tokens with helpers for recursive-descent parsing.

    ``tokens`` is the sequence being parsed and ``i`` is the index of the
    current (not yet consumed) token.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.i = 0

    def prev(self):
        """Return the token just before the cursor.

        NOTE: with the cursor at 0 this indexes ``tokens[-1]``.
        """
        return self.tokens[self.i - 1]

    def curr(self):
        """Return the token under the cursor; IndexError when exhausted."""
        return self.tokens[self.i]

    def next(self):
        """Return the token after the current one without moving the cursor."""
        return self.tokens[self.i + 1]

    def consume(self):
        """Advance past the current token and return it (None if exhausted)."""
        if self.exhausted():
            return None
        self.i += 1
        return self.prev()

    def match(self, xs):
        """Consume the current token if it is in ``xs``; return whether it was."""
        if not self.exhausted() and self.curr() in xs:
            self.consume()
            return True
        return False

    def expect(self, xs):
        """Consume and return the current token, which must be in ``xs``.

        Raises Exception when the token is not in ``xs`` or the input is
        exhausted.
        """
        if not self.match(xs):
            raise Exception("Expected token \"{}\" but received \"{}\"".format(xs, self._describe_curr()))
        return self.prev()

    def expect_predicate(self, predicate):
        """Consume and return the current token if ``predicate(token)`` holds.

        Raises Exception when the predicate fails or the input is exhausted.
        """
        if not self.exhausted() and predicate(self.curr()):
            return self.consume()
        raise Exception("Expected token \"{}\" to pass predicate, but it did not".format(self._describe_curr()))

    def exhausted(self):
        """Return True once every token has been consumed."""
        return self.i >= len(self.tokens)

    def _describe_curr(self):
        """Return the current token for error messages, safe at end of input."""
        return "<end of input>" if self.exhausted() else self.curr()
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue