Add builtins.split function.
The function 'builtins.split' takes a POSIX extended regular expression
and an arbitrary string. It returns a list of the non-matching substrings
interleaved with lists of the matched groups of the regular expression.
```nix
with builtins;
assert split "(a)b" "abc"      == [ "" [ "a" ] "c" ];
assert split "([ac])" "abc"    == [ "" [ "a" ] "b" [ "c" ] "" ];
assert split "(a)|(c)" "abc"   == [ "" [ "a" null ] "b" [ null "c" ] "" ];
assert split "([[:upper:]]+)" "  FOO   "
                               == [ "  " [ "FOO" ] "   " ];
```
			
			
This commit is contained in:
		
							parent
							
								
									f76e85d8f5
								
							
						
					
					
						commit
						b8867a0239
					
				
					 3 changed files with 153 additions and 0 deletions
				
			
		|  | @ -873,6 +873,43 @@ builtins.sort builtins.lessThan [ 483 249 526 147 42 77 ] | |||
|   </varlistentry> | ||||
| 
 | ||||
| 
 | ||||
|   <varlistentry><term><function>builtins.split</function> | ||||
|   <replaceable>regex</replaceable> <replaceable>str</replaceable></term> | ||||
| 
 | ||||
|   <listitem><para>Returns a list composed of non-matched strings interleaved | ||||
|   with the lists of the <link | ||||
|   xlink:href="http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04">extended | ||||
|   POSIX regular expression</link> <replaceable>regex</replaceable> matches | ||||
|   of <replaceable>str</replaceable>. Each item in the lists of matched | ||||
|   sequences is a regex group; a group that did not take part in the match | ||||
|   is represented by <literal>null</literal>. | ||||
| 
 | ||||
| <programlisting> | ||||
| builtins.split "(a)b" "abc" | ||||
| </programlisting> | ||||
| 
 | ||||
| Evaluates to <literal>[ "" [ "a" ] "c" ]</literal>. | ||||
| 
 | ||||
| <programlisting> | ||||
| builtins.split "([ac])" "abc" | ||||
| </programlisting> | ||||
| 
 | ||||
| Evaluates to <literal>[ "" [ "a" ] "b" [ "c" ] "" ]</literal>. | ||||
| 
 | ||||
| <programlisting> | ||||
| builtins.split "(a)|(c)" "abc" | ||||
| </programlisting> | ||||
| 
 | ||||
| Evaluates to <literal>[ "" [ "a" null ] "b" [ null "c" ] "" ]</literal>. | ||||
| 
 | ||||
| <programlisting> | ||||
| builtins.split "([[:upper:]]+)" "  FOO   " | ||||
| </programlisting> | ||||
| 
 | ||||
| Evaluates to <literal>[ "  " [ "FOO" ] "   " ]</literal>. | ||||
| 
 | ||||
|   </para></listitem> | ||||
|   </varlistentry> | ||||
| 
 | ||||
|   <varlistentry><term><function>builtins.stringLength</function> | ||||
|   <replaceable>e</replaceable></term> | ||||
| 
 | ||||
|  |  | |||
|  | @ -1745,6 +1745,73 @@ static void prim_match(EvalState & state, const Pos & pos, Value * * args, Value | |||
| } | ||||
| 
 | ||||
| 
 | ||||
| /* Split a string with a regular expression, and return a list of the
 | ||||
|    non-matching parts interleaved by the lists of the matching groups. */ | ||||
| static void prim_split(EvalState & state, const Pos & pos, Value * * args, Value & v) | ||||
| { | ||||
|     auto re = state.forceStringNoCtx(*args[0], pos); | ||||
| 
 | ||||
|     try { | ||||
| 
 | ||||
|         std::regex regex(re, std::regex::extended); | ||||
| 
 | ||||
|         PathSet context; | ||||
|         const std::string str = state.forceString(*args[1], context, pos); | ||||
| 
 | ||||
|         auto begin = std::sregex_iterator(str.begin(), str.end(), regex); | ||||
|         auto end = std::sregex_iterator(); | ||||
| 
 | ||||
|         // Any matches results are surrounded by non-matching results.
 | ||||
|         const size_t len = std::distance(begin, end); | ||||
|         state.mkList(v, 2 * len + 1); | ||||
|         size_t idx = 0; | ||||
|         Value * elem; | ||||
| 
 | ||||
|         if (len == 0) { | ||||
|             v.listElems()[idx++] = args[1]; | ||||
|             return; | ||||
|         } | ||||
| 
 | ||||
|         for (std::sregex_iterator i = begin; i != end; ++i) { | ||||
|             assert(idx <= 2 * len + 1 - 3); | ||||
|             std::smatch match = *i; | ||||
| 
 | ||||
|             // Add a string for non-matched characters.
 | ||||
|             elem = v.listElems()[idx++] = state.allocValue(); | ||||
|             mkString(*elem, match.prefix().str().c_str()); | ||||
| 
 | ||||
|             // Add a list for matched substrings.
 | ||||
|             const size_t slen = match.size() - 1; | ||||
|             elem = v.listElems()[idx++] = state.allocValue(); | ||||
| 
 | ||||
|             // Start at 1, beacause the first match is the whole string.
 | ||||
|             state.mkList(*elem, slen); | ||||
|             for (size_t si = 0; si < slen; ++si) { | ||||
|                 if (!match[si + 1].matched) | ||||
|                     mkNull(*(elem->listElems()[si] = state.allocValue())); | ||||
|                 else | ||||
|                     mkString(*(elem->listElems()[si] = state.allocValue()), match[si + 1].str().c_str()); | ||||
|             } | ||||
| 
 | ||||
|             // Add a string for non-matched suffix characters.
 | ||||
|             if (idx == 2 * len) { | ||||
|                 elem = v.listElems()[idx++] = state.allocValue(); | ||||
|                 mkString(*elem, match.suffix().str().c_str()); | ||||
|             } | ||||
|         } | ||||
|         assert(idx == 2 * len + 1); | ||||
| 
 | ||||
|     } catch (std::regex_error &e) { | ||||
|         if (e.code() == std::regex_constants::error_space) { | ||||
|           // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
 | ||||
|           throw EvalError("memory limit exceeded by regular expression '%s', at %s", re, pos); | ||||
|         } else { | ||||
|           throw EvalError("invalid regular expression '%s', at %s", re, pos); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| static void prim_concatStringSep(EvalState & state, const Pos & pos, Value * * args, Value & v) | ||||
| { | ||||
|     PathSet context; | ||||
|  | @ -2039,6 +2106,7 @@ void EvalState::createBaseEnv() | |||
|     addPrimOp("__unsafeDiscardOutputDependency", 1, prim_unsafeDiscardOutputDependency); | ||||
|     addPrimOp("__hashString", 2, prim_hashString); | ||||
|     addPrimOp("__match", 2, prim_match); | ||||
|     addPrimOp("__split", 2, prim_split); | ||||
|     addPrimOp("__concatStringsSep", 2, prim_concatStringSep); | ||||
|     addPrimOp("__replaceStrings", 3, prim_replaceStrings); | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										48
									
								
								tests/lang/eval-okay-regex-split.nix
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								tests/lang/eval-okay-regex-split.nix
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,48 @@ | |||
| with builtins; | ||||
| 
 | ||||
| # Non-capturing regex returns empty lists | ||||
| assert  split "foobar" "foobar"  == ["" [] ""]; | ||||
| assert  split "fo*" "f"          == ["" [] ""]; | ||||
| assert  split "fo+" "f"          == ["f"]; | ||||
| assert  split "fo*" "fo"         == ["" [] ""]; | ||||
| assert  split "fo*" "foo"        == ["" [] ""]; | ||||
| assert  split "fo+" "foo"        == ["" [] ""]; | ||||
| assert  split "fo{1,2}" "foo"    == ["" [] ""]; | ||||
| assert  split "fo{1,2}" "fooo"   == ["" [] "o"]; | ||||
| assert  split "fo*" "foobar"     == ["" [] "bar"]; | ||||
| 
 | ||||
| # Capturing regex returns a list of sub-matches | ||||
| assert  split "(fo*)" "f"        == ["" ["f"] ""]; | ||||
| assert  split "(fo+)" "f"        == ["f"]; | ||||
| assert  split "(fo*)" "fo"       == ["" ["fo"] ""]; | ||||
| assert  split "(f)(o*)" "f"      == ["" ["f" ""] ""]; | ||||
| assert  split "(f)(o*)" "foo"    == ["" ["f" "oo"] ""]; | ||||
| assert  split "(fo+)" "foo"      == ["" ["foo"] ""]; | ||||
| assert  split "(fo{1,2})" "foo"  == ["" ["foo"] ""]; | ||||
| assert  split "(fo{1,2})" "fooo" == ["" ["foo"] "o"]; | ||||
| assert  split "(fo*)" "foobar"   == ["" ["foo"] "bar"]; | ||||
| 
 | ||||
| # Matches are greedy. | ||||
| assert  split "(o+)" "oooofoooo" == ["" ["oooo"] "f" ["oooo"] ""]; | ||||
| 
 | ||||
| # Matches multiple times. | ||||
| assert  split "(b)" "foobarbaz"  == ["foo" ["b"] "ar" ["b"] "az"]; | ||||
| 
 | ||||
| # Split large strings containing newlines. A null is inserted when a | ||||
| # group within the current match did not match anything. | ||||
| assert  split "[[:space:]]+|([',.!?])" '' | ||||
|   Nix Rocks! | ||||
|   That's why I use it. | ||||
| ''  == [ | ||||
|   "Nix" [ null ] "Rocks" ["!"] "" [ null ] | ||||
|   "That" ["'"] "s" [ null ] "why" [ null ] "I" [ null ] "use" [ null ] "it" ["."] "" [ null ] | ||||
|   "" | ||||
| ]; | ||||
| 
 | ||||
| # Documentation examples | ||||
| assert  split  "(a)b" "abc"      == [ "" [ "a" ] "c" ]; | ||||
| assert  split  "([ac])" "abc"    == [ "" [ "a" ] "b" [ "c" ] "" ]; | ||||
| assert  split  "(a)|(c)" "abc"   == [ "" [ "a" null ] "b" [ null "c" ] "" ]; | ||||
| assert  split  "([[:upper:]]+)" "  FOO   " == [ "  " [ "FOO" ] "   " ]; | ||||
| 
 | ||||
| true | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue