Implement the Rabin Karp string matching algorithm
This algorithm is pretty interesting because, as long as hash collisions are rare, it runs in linear time with respect to the length of the `corpus` string. It does this by using a sliding window hash. This hash -- because it's a sliding window -- runs in constant time for each iteration; we're only adding and subtracting one character each time and not re-hashing the whole "window". Only when our hashes match do we compare the "window" to the `pattern`. String comparisons are linear because they compare the two strings one character at a time. But because we only compare strings when our hashes match (a check which runs in constant time), this spares us the performance hit.
This commit is contained in:
		
							parent
							
								
									a2fa88f561
								
							
						
					
					
						commit
						6989c3a91a
					
				
					 1 changed files with 27 additions and 0 deletions
				
			
		
							
								
								
									
										27
									
								
								scratch/facebook/rabin-karp.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								scratch/facebook/rabin-karp.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,27 @@ | |||
def substring_exists(corpus, pattern):
    """
    Return True if `pattern` appears in `corpus`.

    This is a Rabin-Karp search using an additive rolling hash (the sum of
    the code points in the current window). Let m be the length of `corpus`
    and n be the length of `pattern`. Updating the hash costs O(1) per
    window, and the O(n) string comparison only happens when the hashes
    agree, so the search takes roughly O(m + n) time when collisions are
    rare. Because the additive hash is order-insensitive (any anagram of
    `pattern` collides with it), the worst case degrades to O(m * n). To
    improve the efficiency of this algorithm, use a hashing function that
    reduces the number of collisions, which will consequently reduce the
    number of string-to-string, linear comparisons.

    An empty `pattern` is considered to appear in every `corpus`.
    """
    m, n = len(corpus), len(pattern)

    # A pattern longer than the corpus can never fit inside it.
    if n > m:
        return False

    window_hash = sum(ord(c) for c in corpus[:n])
    pattern_hash = sum(ord(c) for c in pattern)

    # There are exactly m - n + 1 window positions; the last one starts at
    # index m - n, so the range must extend to m - n + 1 (exclusive).
    for i in range(m - n + 1):
        if i:
            # Update the hash of the window by subtracting the character
            # that is sliding out of view and adding the character that is
            # sliding into view.
            window_hash += ord(corpus[i + n - 1]) - ord(corpus[i - 1])
        # Integer comparison in O(1) time; only on a hash match do we pay
        # for the O(n) string comparison.
        if window_hash == pattern_hash and corpus[i:i + n] == pattern:
            return True
    return False
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue