Merge branch 'tokenize' of https://github.com/nbp/nix
This commit is contained in:
		
						commit
						2ee1b9359b
					
				
					 3 changed files with 153 additions and 0 deletions
				
			
		| 
						 | 
				
			
			@ -873,6 +873,43 @@ builtins.sort builtins.lessThan [ 483 249 526 147 42 77 ]
 | 
			
		|||
  </varlistentry>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
  <varlistentry><term><function>builtins.split</function>
 | 
			
		||||
  <replaceable>regex</replaceable> <replaceable>str</replaceable></term>
 | 
			
		||||
 | 
			
		||||
  <listitem><para>Returns a list composed of non-matched strings interleaved
 | 
			
		||||
  with the lists of the <link
 | 
			
		||||
  xlink:href="http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04">extended
 | 
			
		||||
  POSIX regular expression</link> <replaceable>regex</replaceable> matches
 | 
			
		||||
  of <replaceable>str</replaceable>. Each item in the lists of matched
 | 
			
		||||
  sequences is a regex group.
 | 
			
		||||
 | 
			
		||||
<programlisting>
 | 
			
		||||
builtins.split "(a)b" "abc"
 | 
			
		||||
</programlisting>
 | 
			
		||||
 | 
			
		||||
Evaluates to <literal>[ "" [ "a" ] "c" ]</literal>.
 | 
			
		||||
 | 
			
		||||
<programlisting>
 | 
			
		||||
builtins.split "([ac])" "abc"
 | 
			
		||||
</programlisting>
 | 
			
		||||
 | 
			
		||||
Evaluates to <literal>[ "" [ "a" ] "b" [ "c" ] "" ]</literal>.
 | 
			
		||||
 | 
			
		||||
<programlisting>
 | 
			
		||||
builtins.split "(a)|(c)" "abc"
 | 
			
		||||
</programlisting>
 | 
			
		||||
 | 
			
		||||
Evaluates to <literal>[ "" [ "a" null ] "b" [ null "c" ] "" ]</literal>.
 | 
			
		||||
 | 
			
		||||
<programlisting>
 | 
			
		||||
builtins.split "([[:upper:]]+)" "  FOO   "
 | 
			
		||||
</programlisting>
 | 
			
		||||
 | 
			
		||||
Evaluates to <literal>[ "  " [ "FOO" ] "   " ]</literal>.
 | 
			
		||||
 | 
			
		||||
  </para></listitem>
 | 
			
		||||
  </varlistentry>
 | 
			
		||||
 | 
			
		||||
  <varlistentry><term><function>builtins.stringLength</function>
 | 
			
		||||
  <replaceable>e</replaceable></term>
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1745,6 +1745,73 @@ static void prim_match(EvalState & state, const Pos & pos, Value * * args, Value
 | 
			
		|||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/* Split a string with a regular expression, and return a list of the
 | 
			
		||||
   non-matching parts interleaved by the lists of the matching groups. */
 | 
			
		||||
static void prim_split(EvalState & state, const Pos & pos, Value * * args, Value & v)
 | 
			
		||||
{
 | 
			
		||||
    auto re = state.forceStringNoCtx(*args[0], pos);
 | 
			
		||||
 | 
			
		||||
    try {
 | 
			
		||||
 | 
			
		||||
        std::regex regex(re, std::regex::extended);
 | 
			
		||||
 | 
			
		||||
        PathSet context;
 | 
			
		||||
        const std::string str = state.forceString(*args[1], context, pos);
 | 
			
		||||
 | 
			
		||||
        auto begin = std::sregex_iterator(str.begin(), str.end(), regex);
 | 
			
		||||
        auto end = std::sregex_iterator();
 | 
			
		||||
 | 
			
		||||
        // Any matches results are surrounded by non-matching results.
 | 
			
		||||
        const size_t len = std::distance(begin, end);
 | 
			
		||||
        state.mkList(v, 2 * len + 1);
 | 
			
		||||
        size_t idx = 0;
 | 
			
		||||
        Value * elem;
 | 
			
		||||
 | 
			
		||||
        if (len == 0) {
 | 
			
		||||
            v.listElems()[idx++] = args[1];
 | 
			
		||||
            return;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for (std::sregex_iterator i = begin; i != end; ++i) {
 | 
			
		||||
            assert(idx <= 2 * len + 1 - 3);
 | 
			
		||||
            std::smatch match = *i;
 | 
			
		||||
 | 
			
		||||
            // Add a string for non-matched characters.
 | 
			
		||||
            elem = v.listElems()[idx++] = state.allocValue();
 | 
			
		||||
            mkString(*elem, match.prefix().str().c_str());
 | 
			
		||||
 | 
			
		||||
            // Add a list for matched substrings.
 | 
			
		||||
            const size_t slen = match.size() - 1;
 | 
			
		||||
            elem = v.listElems()[idx++] = state.allocValue();
 | 
			
		||||
 | 
			
		||||
            // Start at 1, beacause the first match is the whole string.
 | 
			
		||||
            state.mkList(*elem, slen);
 | 
			
		||||
            for (size_t si = 0; si < slen; ++si) {
 | 
			
		||||
                if (!match[si + 1].matched)
 | 
			
		||||
                    mkNull(*(elem->listElems()[si] = state.allocValue()));
 | 
			
		||||
                else
 | 
			
		||||
                    mkString(*(elem->listElems()[si] = state.allocValue()), match[si + 1].str().c_str());
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // Add a string for non-matched suffix characters.
 | 
			
		||||
            if (idx == 2 * len) {
 | 
			
		||||
                elem = v.listElems()[idx++] = state.allocValue();
 | 
			
		||||
                mkString(*elem, match.suffix().str().c_str());
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        assert(idx == 2 * len + 1);
 | 
			
		||||
 | 
			
		||||
    } catch (std::regex_error &e) {
 | 
			
		||||
        if (e.code() == std::regex_constants::error_space) {
 | 
			
		||||
          // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
 | 
			
		||||
          throw EvalError("memory limit exceeded by regular expression '%s', at %s", re, pos);
 | 
			
		||||
        } else {
 | 
			
		||||
          throw EvalError("invalid regular expression '%s', at %s", re, pos);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
static void prim_concatStringSep(EvalState & state, const Pos & pos, Value * * args, Value & v)
 | 
			
		||||
{
 | 
			
		||||
    PathSet context;
 | 
			
		||||
| 
						 | 
				
			
			@ -2039,6 +2106,7 @@ void EvalState::createBaseEnv()
 | 
			
		|||
    addPrimOp("__unsafeDiscardOutputDependency", 1, prim_unsafeDiscardOutputDependency);
 | 
			
		||||
    addPrimOp("__hashString", 2, prim_hashString);
 | 
			
		||||
    addPrimOp("__match", 2, prim_match);
 | 
			
		||||
    addPrimOp("__split", 2, prim_split);
 | 
			
		||||
    addPrimOp("__concatStringsSep", 2, prim_concatStringSep);
 | 
			
		||||
    addPrimOp("__replaceStrings", 3, prim_replaceStrings);
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										48
									
								
								tests/lang/eval-okay-regex-split.nix
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								tests/lang/eval-okay-regex-split.nix
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,48 @@
 | 
			
		|||
with builtins;
 | 
			
		||||
 | 
			
		||||
# Non-capturing regex returns empty lists
 | 
			
		||||
assert  split "foobar" "foobar"  == ["" [] ""];
 | 
			
		||||
assert  split "fo*" "f"          == ["" [] ""];
 | 
			
		||||
assert  split "fo+" "f"          == ["f"];
 | 
			
		||||
assert  split "fo*" "fo"         == ["" [] ""];
 | 
			
		||||
assert  split "fo*" "foo"        == ["" [] ""];
 | 
			
		||||
assert  split "fo+" "foo"        == ["" [] ""];
 | 
			
		||||
assert  split "fo{1,2}" "foo"    == ["" [] ""];
 | 
			
		||||
assert  split "fo{1,2}" "fooo"   == ["" [] "o"];
 | 
			
		||||
assert  split "fo*" "foobar"     == ["" [] "bar"];
 | 
			
		||||
 | 
			
		||||
# Capturing regex returns a list of sub-matches
 | 
			
		||||
assert  split "(fo*)" "f"        == ["" ["f"] ""];
 | 
			
		||||
assert  split "(fo+)" "f"        == ["f"];
 | 
			
		||||
assert  split "(fo*)" "fo"       == ["" ["fo"] ""];
 | 
			
		||||
assert  split "(f)(o*)" "f"      == ["" ["f" ""] ""];
 | 
			
		||||
assert  split "(f)(o*)" "foo"    == ["" ["f" "oo"] ""];
 | 
			
		||||
assert  split "(fo+)" "foo"      == ["" ["foo"] ""];
 | 
			
		||||
assert  split "(fo{1,2})" "foo"  == ["" ["foo"] ""];
 | 
			
		||||
assert  split "(fo{1,2})" "fooo" == ["" ["foo"] "o"];
 | 
			
		||||
assert  split "(fo*)" "foobar"   == ["" ["foo"] "bar"];
 | 
			
		||||
 | 
			
		||||
# Matches are greedy.
 | 
			
		||||
assert  split "(o+)" "oooofoooo" == ["" ["oooo"] "f" ["oooo"] ""];
 | 
			
		||||
 | 
			
		||||
# Matches multiple times.
 | 
			
		||||
assert  split "(b)" "foobarbaz"  == ["foo" ["b"] "ar" ["b"] "az"];
 | 
			
		||||
 | 
			
		||||
# Split large strings containing newlines. A null is inserted when a
 | 
			
		||||
# pattern within the current match did not match anything.
 | 
			
		||||
assert  split "[[:space:]]+|([',.!?])" ''
 | 
			
		||||
  Nix Rocks!
 | 
			
		||||
  That's why I use it.
 | 
			
		||||
''  == [
 | 
			
		||||
  "Nix" [ null ] "Rocks" ["!"] "" [ null ]
 | 
			
		||||
  "That" ["'"] "s" [ null ] "why" [ null ] "I" [ null ] "use" [ null ] "it" ["."] "" [ null ]
 | 
			
		||||
  ""
 | 
			
		||||
];
 | 
			
		||||
 | 
			
		||||
# Documentation examples
 | 
			
		||||
assert  split  "(a)b" "abc"      == [ "" [ "a" ] "c" ];
 | 
			
		||||
assert  split  "([ac])" "abc"    == [ "" [ "a" ] "b" [ "c" ] "" ];
 | 
			
		||||
assert  split  "(a)|(c)" "abc"   == [ "" [ "a" null ] "b" [ null "c" ] "" ];
 | 
			
		||||
assert  split  "([[:upper:]]+)" "  FOO   " == [ "  " [ "FOO" ] "   " ];
 | 
			
		||||
 | 
			
		||||
true
 | 
			
		||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue