shared: add NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY flag for nm_utils_strsplit_set_full()

Previously, nm_utils_strsplit_set_full() would always remove empty
tokens. Add a flag NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY to avoid
that.

This makes nm_utils_strsplit_set_full() return the same result as
g_strsplit_set() and a direct replacement for it -- except for "",
where we return %NULL.
This commit is contained in:
Thomas Haller 2019-04-03 17:32:59 +02:00
parent 453b3ea362
commit 5c1f93943e
3 changed files with 364 additions and 47 deletions

View file

@ -259,55 +259,279 @@ test_nm_g_slice_free_fcn (void)
/*****************************************************************************/
static void
_do_test_nm_utils_strsplit_set (gboolean escape, const char *str, ...)
_do_test_nm_utils_strsplit_set_f_one (NMUtilsStrsplitSetFlags flags,
const char *str,
gsize words_len,
const char *const*exp_words)
{
gs_unref_ptrarray GPtrArray *args_array = g_ptr_array_new ();
const char *const*args;
const char *DELIMITERS = " \t\n";
#define DELIMITERS_C ' ', '\t', '\n'
gs_free const char **words = NULL;
const char *arg;
gsize i;
va_list ap;
gsize i, j, k;
const gboolean f_allow_escaping = NM_FLAGS_HAS (flags, NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING);
const gboolean f_preserve_empty = NM_FLAGS_HAS (flags, NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY);
const char *s1;
gsize initial_offset;
gs_strfreev char **words_g = NULL;
va_start (ap, str);
while ((arg = va_arg (ap, const char *)))
g_ptr_array_add (args_array, (gpointer) arg);
va_end (ap);
g_ptr_array_add (args_array, NULL);
g_assert (!NM_FLAGS_ANY (flags, ~( NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING
| NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY)));
args = (const char *const*) args_array->pdata;
/* assert that the epected words are valid (and don't contain unescaped delimiters). */
for (i = 0; i < words_len; i++) {
const char *w = exp_words[i];
if (!escape && nmtst_get_rand_bool ())
words = nm_utils_strsplit_set (str, " \t\n");
else {
words = nm_utils_strsplit_set_full (str,
" \t\n",
escape
? NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING
: NM_UTILS_STRSPLIT_SET_FLAGS_NONE);
g_assert (w);
if (!f_preserve_empty)
g_assert (w[0]);
for (k = 0; w[k]; ) {
if ( f_allow_escaping
&& w[k] == '\\') {
k++;
if (w[k] == '\0')
break;
k++;
continue;
}
g_assert (!NM_IN_SET (w[k], DELIMITERS_C));
k++;
}
if (!f_allow_escaping)
g_assert (!NM_STRCHAR_ANY (w, ch, NM_IN_SET (ch, DELIMITERS_C)));
}
if (!args[0]) {
initial_offset = (f_preserve_empty || !str)
? 0u
: strspn (str, DELIMITERS);
/* first compare our expected values with what g_strsplit_set() would
* do. */
words_g = str ? g_strsplit_set (str, DELIMITERS, -1) : NULL;
if (str == NULL) {
g_assert_cmpint (words_len, ==, 0);
g_assert (!words_g);
} else if (nm_streq0 (str, "")) {
g_assert_cmpint (words_len, ==, 0);
g_assert (words_g);
g_assert (!words_g[0]);
} else {
g_assert (words_g);
g_assert (words_g[0]);
if (!f_allow_escaping) {
if (!f_preserve_empty) {
for (i = 0, j = 0; words_g[i]; i++) {
if (words_g[i][0] == '\0')
g_free (words_g[i]);
else
words_g[j++] = words_g[i];
}
words_g[j] = NULL;
}
if (f_preserve_empty)
g_assert_cmpint (words_len, >, 0);
for (i = 0; i < words_len; i++) {
g_assert (exp_words[i]);
g_assert_cmpstr (exp_words[i], ==, words_g[i]);
}
g_assert (words_g[words_len] == NULL);
g_assert_cmpint (NM_PTRARRAY_LEN (words_g), ==, words_len);
g_assert (_nm_utils_strv_cmp_n (exp_words, words_len, NM_CAST_STRV_CC (words_g), -1) == 0);
}
}
if ( flags == NM_UTILS_STRSPLIT_SET_FLAGS_NONE
&& nmtst_get_rand_bool ())
words = nm_utils_strsplit_set (str, DELIMITERS);
else if ( flags == NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY
&& nmtst_get_rand_bool ())
words = nm_utils_strsplit_set_with_empty (str, DELIMITERS);
else
words = nm_utils_strsplit_set_full (str, DELIMITERS, flags);
g_assert_cmpint (NM_PTRARRAY_LEN (words), ==, words_len);
if (words_len == 0) {
g_assert (!words);
g_assert ( !str
|| NM_STRCHAR_ALL (str, ch, NM_IN_SET (ch, ' ', '\t', '\n')));
|| NM_STRCHAR_ALL (str, ch, NM_IN_SET (ch, DELIMITERS_C)));
return;
}
g_assert (words);
for (i = 0; args[i] || words[i]; i++) {
g_assert (args[i]);
g_assert (words[i]);
g_assert (args[i][0]);
g_assert (escape || NM_STRCHAR_ALL (args[i], ch, !NM_IN_SET (ch, ' ', '\t', '\n')));
g_assert_cmpstr (args[i], ==, words[i]);
for (i = 0; i < words_len; i++)
g_assert_cmpstr (exp_words[i], ==, words[i]);
g_assert (words[words_len] == NULL);
g_assert (_nm_utils_strv_cmp_n (exp_words, words_len, words, -1) == 0);
s1 = words[0];
g_assert (s1 >= (char *) &words[words_len + 1]);
s1 = &s1[strlen (str)];
for (i = 1; i < words_len; i++) {
g_assert (&(words[i - 1])[strlen (words[i - 1])] < words[i]);
g_assert (words[i] <= s1);
}
/* while strsplit removes all delimiters, we can relatively easily find them
* in the original string. Assert that the original string and the pointer offsets
* of words correspond. In particular, find idx_delim_after and idx_delim_before
* to determine which delimiter was after/before a word. */
{
gsize idx_word_start;
gsize idx_delim_after_old = G_MAXSIZE;
idx_word_start = initial_offset;
for (i = 0; i < words_len; i++) {
const gsize l_i = strlen (words[i]);
gsize idx_delim_after;
gsize idx_delim_before;
/* find the delimiter *after* words[i]. We can do that by looking at the next
* word and calculating the pointer difference.
*
* The delimiter after the very last word is '\0' and requires strlen() to find. */
idx_delim_after = initial_offset + ((words[i] - words[0]) + l_i);
if (idx_delim_after != idx_word_start + l_i) {
g_assert (!f_preserve_empty);
g_assert_cmpint (idx_word_start + l_i, <, idx_delim_after);
idx_word_start = idx_delim_after - l_i;
}
if (i + 1 < words_len) {
gsize x = initial_offset + ((words[i + 1] - words[0]) - 1);
if (idx_delim_after != x) {
g_assert (!f_preserve_empty);
g_assert_cmpint (idx_delim_after, <, x);
for (k = idx_delim_after; k <= x; k++)
g_assert (NM_IN_SET (str[k], DELIMITERS_C));
}
g_assert (NM_IN_SET (str[idx_delim_after], DELIMITERS_C));
} else {
if (f_preserve_empty)
g_assert (NM_IN_SET (str[idx_delim_after], '\0'));
else
g_assert (NM_IN_SET (str[idx_delim_after], '\0', DELIMITERS_C));
}
/* find the delimiter *before* words[i]. */
if (i == 0) {
/* there is only a delimiter *before*, with !f_preserve_empty and leading
* delimiters. */
idx_delim_before = G_MAXSIZE;
if (initial_offset > 0) {
g_assert (!f_preserve_empty);
idx_delim_before = initial_offset - 1;
}
} else
idx_delim_before = initial_offset + (words[i] - words[0]) - 1;
if (idx_delim_before != G_MAXSIZE)
g_assert (NM_IN_SET (str[idx_delim_before], DELIMITERS_C));
if (idx_delim_after_old != idx_delim_before) {
g_assert (!f_preserve_empty);
if (i == 0) {
g_assert_cmpint (initial_offset, >, 0);
g_assert_cmpint (idx_delim_before, !=, G_MAXSIZE);
g_assert_cmpint (idx_delim_before, ==, initial_offset - 1);
} else {
g_assert_cmpint (idx_delim_after_old, !=, G_MAXSIZE);
g_assert_cmpint (idx_delim_before, !=, G_MAXSIZE);
g_assert_cmpint (idx_delim_after_old, <, idx_delim_before);
for (k = idx_delim_after_old; k <= idx_delim_before; k++)
g_assert (NM_IN_SET (str[k], DELIMITERS_C));
}
}
for (k = 0; k < l_i; ) {
if ( f_allow_escaping
&& str[idx_word_start + k] == '\\') {
k++;
if (k >= l_i)
break;
k++;
continue;
}
g_assert (!NM_IN_SET (str[idx_word_start + k], DELIMITERS_C));
k++;
}
g_assert (strncmp (words[i], &str[idx_word_start], l_i) == 0);
if (i > 0) {
const char *s = &(words[i - 1])[strlen (words[i - 1]) + 1];
if (s != words[i]) {
g_assert (!f_preserve_empty);
g_assert (s < words[i]);
}
}
idx_word_start += l_i + 1;
idx_delim_after_old = idx_delim_after;
}
}
}
#define do_test_nm_utils_strsplit_set(str, ...) \
_do_test_nm_utils_strsplit_set (str, ##__VA_ARGS__, NULL)
static void
_do_test_nm_utils_strsplit_set_f (NMUtilsStrsplitSetFlags flags,
const char *str,
gsize words_len,
const char *const*exp_words)
{
_do_test_nm_utils_strsplit_set_f_one (flags, str, words_len, exp_words);
if (NM_FLAGS_HAS (flags, NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY)) {
gs_unref_ptrarray GPtrArray *exp_words2 = NULL;
gsize k;
exp_words2 = g_ptr_array_new ();
for (k = 0; k < words_len; k++) {
if (exp_words[k][0] != '\0')
g_ptr_array_add (exp_words2, (gpointer) exp_words[k]);
}
_do_test_nm_utils_strsplit_set_f_one (flags & (~NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY),
str,
exp_words2->len,
(const char *const*) exp_words2->pdata);
}
}
#define do_test_nm_utils_strsplit_set_f(flags, str, ...) \
_do_test_nm_utils_strsplit_set_f (flags, \
str, \
NM_NARG (__VA_ARGS__), \
NM_MAKE_STRV (__VA_ARGS__))
#define do_test_nm_utils_strsplit_set(allow_escaping, str, ...) \
do_test_nm_utils_strsplit_set_f ( (allow_escaping) \
? NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING \
: NM_UTILS_STRSPLIT_SET_FLAGS_NONE, \
str, \
##__VA_ARGS__)
static void
test_nm_utils_strsplit_set (void)
{
gs_unref_ptrarray GPtrArray *words_exp = NULL;
guint test_run;
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_NONE, NULL);
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_NONE, "");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_NONE, " ");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_NONE, "a b", "a", "b");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, NULL);
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, "");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, " ", "", "");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, " ", "", "", "");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, "a ", "a", "", "");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, "a b", "a", "", "b");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, " ab b", "", "ab", "", "b");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, "ab b", "ab", "", "b");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, "abb", "abb");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, "abb bb ", "abb", "", "bb", "");
do_test_nm_utils_strsplit_set_f (NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, "abb bcb ", "abb", "bcb", "");
do_test_nm_utils_strsplit_set (FALSE, NULL);
do_test_nm_utils_strsplit_set (FALSE, "");
do_test_nm_utils_strsplit_set (FALSE, "\t");
@ -331,6 +555,61 @@ test_nm_utils_strsplit_set (void)
do_test_nm_utils_strsplit_set (TRUE, "foo\\", "foo\\");
do_test_nm_utils_strsplit_set (TRUE, "bar foo\\", "bar", "foo\\");
do_test_nm_utils_strsplit_set (TRUE, "\\ a b\\ \\ c", "\\ a", "b\\ \\ ", "c");
words_exp = g_ptr_array_new_with_free_func (g_free);
for (test_run = 0; test_run < 100; test_run++) {
gboolean f_allow_escaping = nmtst_get_rand_bool ();
guint words_len = nmtst_get_rand_int () % 100;
gs_free char *str = NULL;
guint i;
g_ptr_array_set_size (words_exp, 0);
for (i = 0; i < words_len; i++) {
guint word_len;
char *word;
guint j;
word_len = nmtst_get_rand_int ();
if ((word_len % 100) < 30)
word_len = 0;
else
word_len = (word_len >> 10) % 100;
word = g_new (char, word_len + 3);
for (j = 0; j < word_len; ) {
guint32 p = nmtst_get_rand_int ();
static const char delimiters_arr[] = { DELIMITERS_C };
static const char regular_chars[] = "abcdefghijklmnopqrstuvwxyz";
if ( !f_allow_escaping
|| (p % 1000) < 700) {
if (((p >> 20) % 100) < 20)
word[j++] = '\\';
word[j++] = regular_chars[(p >> 11) % (G_N_ELEMENTS (regular_chars) - 1)];
continue;
}
word[j++] = '\\';
word[j++] = delimiters_arr[(p >> 11) % G_N_ELEMENTS (delimiters_arr)];
}
word[j] = '\0';
g_ptr_array_add (words_exp, word);
}
g_ptr_array_add (words_exp, NULL);
str = g_strjoinv (" ", (char **) words_exp->pdata);
if ( str[0] == '\0'
&& words_len > 0) {
g_assert (words_len == 1);
g_assert_cmpstr (words_exp->pdata[0], ==, "");
words_len = 0;
}
_do_test_nm_utils_strsplit_set_f ( (f_allow_escaping ? NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING : NM_UTILS_STRSPLIT_SET_FLAGS_NONE)
| NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY,
str,
words_len,
(const char *const*) words_exp->pdata);
}
}
/*****************************************************************************/

View file

@ -989,17 +989,26 @@ _char_lookup_has (const guint8 lookup[static 256],
* each word once (the entire strv array), but instead copies it once
* and all words point into that internal copy.
*
* Another difference from g_strsplit_set() is that this never returns
* empty words. Multiple delimiters are combined and treated as one.
* Note that for @str %NULL and "", this always returns %NULL too. That differs
* from g_strsplit_set(), which would return an empty strv array for "".
*
* Note that g_strsplit_set() returns empty words as well. By default,
* nm_utils_strsplit_set_full() strips all empty tokens (that is, repeated
* delimiters. With %NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY, empty tokens
* are not removed.
*
* If @flags has %NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING, delimiters prefixed
* by a backslash are not treated as a separator. Such delimiters and their escape
* character are copied to the current word without unescaping them.
* character are copied to the current word without unescaping them. In general,
* nm_utils_strsplit_set_full() does not remove any backslash escape characters
* and does not unescaping. It only considers them for skipping to split at
* an escaped delimiter.
*
* Returns: %NULL if @str is %NULL or contains only delimiters.
* Otherwise, a %NULL terminated strv array containing non-empty
* words, split at the delimiter characters (delimiter characters
* are removed).
* Returns: %NULL if @str is %NULL or "".
* If @str only contains delimiters and %NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY
* is not set, it also returns %NULL.
* Otherwise, a %NULL terminated strv array containing the split words.
* (delimiter characters are removed).
* The strings to which the result strv array points to are allocated
* after the returned result itself. Don't free the strings themself,
* but free everything with g_free().
@ -1012,12 +1021,15 @@ nm_utils_strsplit_set_full (const char *str,
NMUtilsStrsplitSetFlags flags)
{
const char **ptr, **ptr0;
gsize alloc_size, plen, i;
gsize alloc_size;
gsize plen;
gsize i;
gsize str_len;
char *s0;
char *s;
guint8 ch_lookup[256];
const gboolean f_allow_escaping = NM_FLAGS_HAS (flags, NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING);
const gboolean f_preseve_empty = NM_FLAGS_HAS (flags, NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY);
if (!str)
return NULL;
@ -1031,11 +1043,18 @@ nm_utils_strsplit_set_full (const char *str,
nm_assert ( !f_allow_escaping
|| !_char_lookup_has (ch_lookup, '\\'));
while (_char_lookup_has (ch_lookup, str[0]))
str++;
if (!f_preseve_empty) {
while (_char_lookup_has (ch_lookup, str[0]))
str++;
}
if (!str[0])
if (!str[0]) {
/* We return %NULL here, also with NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY.
* That makes nm_utils_strsplit_set_full() with NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY
* different from g_strsplit_set(), which would in this case return an empty array.
* If you need to handle %NULL, and "" specially, then check the input string first. */
return NULL;
}
str_len = strlen (str) + 1;
alloc_size = 8;
@ -1065,7 +1084,12 @@ nm_utils_strsplit_set_full (const char *str,
ptr[plen++] = s;
nm_assert (s[0] && !_char_lookup_has (ch_lookup, s[0]));
if (s[0] == '\0') {
nm_assert (f_preseve_empty);
goto done;
}
nm_assert ( f_preseve_empty
|| !_char_lookup_has (ch_lookup, s[0]));
while (!_char_lookup_has (ch_lookup, s[0])) {
if (G_UNLIKELY ( s[0] == '\\'
@ -1083,10 +1107,12 @@ nm_utils_strsplit_set_full (const char *str,
nm_assert (_char_lookup_has (ch_lookup, s[0]));
s[0] = '\0';
s++;
while (_char_lookup_has (ch_lookup, s[0]))
s++;
if (s[0] == '\0')
goto done;
if (!f_preseve_empty) {
while (_char_lookup_has (ch_lookup, s[0]))
s++;
if (s[0] == '\0')
goto done;
}
}
done:

View file

@ -334,13 +334,25 @@ int nm_utils_dbus_path_cmp (const char *dbus_path_a, const char *dbus_path_b);
typedef enum {
NM_UTILS_STRSPLIT_SET_FLAGS_NONE = 0,
NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING = (1u << 0),
NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY = (1u << 0),
NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING = (1u << 1),
} NMUtilsStrsplitSetFlags;
const char **nm_utils_strsplit_set_full (const char *str,
const char *delimiter,
NMUtilsStrsplitSetFlags flags);
static inline const char **
nm_utils_strsplit_set_with_empty (const char *str,
const char *delimiters)
{
/* this returns the same result as g_strsplit_set(str, delimiters, -1), except
* it does not deep-clone the strv array.
* Also, for @str == "", this returns %NULL while g_strsplit_set() would return
* an empty strv array. */
return nm_utils_strsplit_set_full (str, delimiters, NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY);
}
static inline const char **
nm_utils_strsplit_set (const char *str,
const char *delimiters)