Write encoded XML parser and pretty-printer
Write a function that reads a string of compressed XML and outputs the decompressed version. Note to self: Now that I'm growing more comfortable writing parsers, I'd like to become equally comfortable writing pretty-printers.
This commit is contained in:
		
							parent
							
								
									bfd2180e6b
								
							
						
					
					
						commit
						c841527f61
					
				
					 2 changed files with 135 additions and 0 deletions
				
			
		
							
								
								
									
										98
									
								
								scratch/facebook/moderate/decompress-xml.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										98
									
								
								scratch/facebook/moderate/decompress-xml.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,98 @@ | ||||||
|  | import string | ||||||
|  | from parser import Parser | ||||||
|  | 
 | ||||||
# Tag ids used by the compressed encoding, mapped to the XML names they stand
# for. Ids are assigned sequentially from 1 in the order listed; 0 is not a
# tag id (the parser below treats a 0 token as the End marker).
mapping = dict(enumerate(
    ("family", "person", "firstName", "lastName", "state"),
    start=1,
))
|  | 
 | ||||||
def parse_int(i, xs):
    """Read the run of ASCII digits in ``xs`` that begins at index ``i``.

    Returns a ``(next_index, value)`` pair: ``next_index`` is the position of
    the first character after the digits and ``value`` is the run converted
    with ``int``. When no digit is found at ``i`` the run is empty and
    ``int("")`` raises ``ValueError``.
    """
    digits = []
    end = len(xs)
    while i < end and xs[i] in string.digits:
        digits.append(xs[i])
        i += 1
    return i, int("".join(digits))
|  | 
 | ||||||
def parse_string(i, xs):
    """Read a string token from ``xs`` starting at index ``i``.

    A string token extends up to (but not including) the character that sits
    immediately before the next digit. In well-formed input that character is
    the space separating the string from the following integer, so embedded
    spaces such as in ``"Some Message"`` stay part of the token.

    Returns a ``(next_index, text)`` pair, where ``next_index`` is the index
    of the first character that was not consumed.

    Unlike a plain ``xs[i + 1]`` lookahead, this version:
      * stops cleanly at the end of the input instead of raising
        ``IndexError`` when no digit follows the string, and
      * always consumes at least one character when ``i`` is in range, so a
        caller looping on the returned index can never stall on input such
        as ``"a1"``.
    """
    start = i
    end = len(xs)
    # Keep going while the lookahead character is not a digit; running off
    # the end of the input counts as "no digit ahead".
    while i < end and (i + 1 == end or xs[i + 1] not in string.digits):
        i += 1
    if i == start and i < end:
        # The very next character is a digit: take the single character at
        # ``start`` so the token is non-empty and the index advances.
        i += 1
    return i, xs[start:i]
|  | 
 | ||||||
def tokenize(xs):
    """Split the compressed-XML string ``xs`` into a flat list of tokens.

    Runs of digits become ``int`` tokens (via ``parse_int``), text starting
    with an ASCII letter becomes a ``str`` token (via ``parse_string``), and
    whitespace between tokens is skipped.

    Raises:
        ValueError: if a token starts with a character that is neither a
            digit, an ASCII letter, nor whitespace. Without this branch the
            index would never advance and the loop would spin forever.
    """
    result = []
    i = 0
    while i < len(xs):
        ch = xs[i]
        if ch in string.digits:
            i, n = parse_int(i, xs)
            result.append(n)
        elif ch in string.ascii_letters:
            i, x = parse_string(i, xs)
            result.append(x)
        elif ch.isspace():
            # Any whitespace (not only " ") separates tokens.
            i += 1
        else:
            raise ValueError(
                "Unexpected character {!r} at index {}".format(ch, i))
    return result
|  | 
 | ||||||
def parse(xs):
    """Tokenize the compressed-XML string ``xs`` and parse one element.

    Returns whatever ``parse_element`` produces for the token stream: a
    ``[tag, attrs, children]`` list, or a bare string if the first token is
    a string.
    """
    tokens = tokenize(xs)
    return parse_element(Parser(tokens))
|  | 
 | ||||||
|  | # Element   -> Tag Attribute* End Child* End ; | ||||||
|  | # Child     -> Element | STRING ; | ||||||
|  | # Tag       -> INTEGER ; | ||||||
|  | # Value     -> STRING ; | ||||||
|  | # Attribute -> Tag Value ; | ||||||
|  | # End       -> 0 ; | ||||||
|  | 
 | ||||||
def parse_element(parser):
    """Parse one node from ``parser`` and return it.

    A string token is returned as-is (a text node). Otherwise the node is an
    element: an integer tag id looked up in ``mapping``, its attributes, a 0
    token, any number of child nodes, and a closing 0 token. Elements are
    returned as ``[tag, attrs, children]``.
    """
    # Text node: hand the string token straight back.
    if type(parser.curr()) == str:
        return parser.consume()

    def is_int(token):
        return type(token) == int

    name = mapping[parser.expect_predicate(is_int)]
    attributes = parse_attrs(parser)
    parser.expect([0])  # 0 closes the attribute list

    kids = []
    while True:
        # Children run until the closing 0 (or until the tokens run out).
        if parser.exhausted() or parser.curr() == 0:
            break
        kids.append(parse_element(parser))
    parser.expect([0])  # 0 closes the element itself
    return [name, attributes, kids]
|  | 
 | ||||||
def parse_attrs(parser):
    """Collect ``(name, value)`` attribute pairs until the next 0 token.

    Each attribute is an integer tag id (translated through ``mapping``)
    followed by one value token. The terminating 0 is left unconsumed for
    the caller.
    """
    pairs = []
    while True:
        if parser.curr() == 0:
            return pairs
        key = mapping[parser.expect_predicate(lambda tok: type(tok) == int)]
        pairs.append((key, parser.consume()))
|  | 
 | ||||||
def stringify_xml(tree, indent=0):
    """Pretty-print a parsed tree as indented XML text.

    Args:
        tree: either a plain string (a text node) or a three-item sequence
            ``[tag, attrs, children]`` where ``attrs`` is an iterable of
            ``(name, value)`` pairs and ``children`` is a list of trees.
        indent: nesting depth of ``tree``; each level adds two spaces.

    Returns:
        The XML text without a trailing newline. A plain-string ``tree`` is
        returned unchanged; an element's opening and closing tags are
        indented by ``indent`` levels and each child by ``indent + 1``.

    Indentation is applied exactly once per line. Applying it both while
    collecting children and again in the final template only lines up for a
    two-level tree: deeper elements are over-indented and sibling children
    end up at different columns. An element without children is rendered as
    its opening tag directly followed by its closing tag, with no blank line
    in between.
    """
    if isinstance(tree, str):
        return tree
    tag, attrs, children = tree

    pad = " " * 2 * indent
    child_pad = " " * 2 * (indent + 1)
    # Each attribute carries its own leading space, so no attributes means
    # nothing at all is inserted after the tag name.
    str_attrs = "".join(" {}=\"{}\"".format(k, v) for k, v in attrs)

    lines = ["{}<{}{}>".format(pad, tag, str_attrs)]
    for child in children:
        if isinstance(child, str):
            # Text nodes carry no indentation of their own.
            lines.append(child_pad + child)
        else:
            # Nested elements indent themselves from the depth passed in.
            lines.append(stringify_xml(child, indent + 1))
    lines.append("{}</{}>".format(pad, tag))
    return "\n".join(lines)
|  | 
 | ||||||
# Demo: run the sample encoding through every stage and print each result.
x = "1 4 McDowell 5 CA 0 2 3 Gayle 0 Some Message 0 0"
print("\n".join([
    "Input:   {}".format(x),
    "Tokens:  {}".format(tokenize(x)),
    "Parsed:  {}".format(parse(x)),
    "{}".format(stringify_xml(parse(x))),
]))
							
								
								
									
										37
									
								
								scratch/facebook/moderate/parser.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								scratch/facebook/moderate/parser.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,37 @@ | ||||||
class Parser(object):
    """Cursor over a list of tokens with helpers for recursive-descent parsing.

    ``tokens`` is the list being read and ``i`` is the index of the current
    (not yet consumed) token.
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.i = 0

    def prev(self):
        """Return the token just before the cursor.

        With the cursor at 0 this indexes ``tokens[-1]``, i.e. the last token
        of a non-empty list.
        """
        return self.tokens[self.i - 1]

    def curr(self):
        """Return the token at the cursor; raises IndexError when exhausted."""
        return self.tokens[self.i]

    def next(self):
        """Return the token after the cursor; raises IndexError if none."""
        return self.tokens[self.i + 1]

    def consume(self):
        """Advance past the current token and return it.

        Returns None, without moving, when the input is exhausted.
        """
        if self.exhausted():
            return None
        self.i += 1
        return self.prev()

    def match(self, xs):
        """Consume the current token if it is in ``xs``; report whether it was."""
        if not self.exhausted() and self.curr() in xs:
            self.consume()
            return True
        return False

    def expect(self, xs):
        """Consume and return the current token, which must be in ``xs``.

        Raises:
            Exception: if the current token is not in ``xs`` or the input is
                exhausted.
        """
        if not self.match(xs):
            raise Exception("Expected token \"{}\" but received \"{}\"".format(
                xs, self._describe_curr()))
        return self.prev()

    def expect_predicate(self, predicate):
        """Consume and return the current token if ``predicate`` accepts it.

        Raises:
            Exception: if the predicate rejects the token or the input is
                exhausted.
        """
        if not self.exhausted() and predicate(self.curr()):
            return self.consume()
        raise Exception(
            "Expected token \"{}\" to pass predicate, but it did not".format(
                self._describe_curr()))

    def exhausted(self):
        """Return True once the cursor has moved past the last token."""
        return self.i >= len(self.tokens)

    def _describe_curr(self):
        """Return the current token for error messages, or an end-of-input marker.

        Calling ``curr()`` directly while building an error message would
        raise IndexError at the end of the input and hide the real error.
        """
        return "<end of input>" if self.exhausted() else self.curr()
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue