shared: add NM_UTILS_STRSPLIT_SET_FLAGS_STRSTRIP to nm_utils_strsplit_set_full()

This will essentially call g_strstrip() on each token.

There are some special cases:

 - if the resulting word is empty after stripping, then the empty token
   will be removed, unless %NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY
   is set. If that results in an empty string array, %NULL will be
   returned.

 - if %NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING is set, then
   whitespace that is backslash escaped is not removed.

Since this is a post-operation that happens after tokenizing, it
could be done as a separate function. And we already have such functions:
_nm_utils_unescape_plain() and _nm_utils_unescape_spaces().
However, that is ugly for several reasons:

 - the stripping should be part of the tokenizing, you shouldn't need
   several steps.

 - nm_utils_strsplit_set_full() returns a "const char **" which
   indicates the strings must not be freed. However, it is perfectly
   valid to modify the string inplace. Hence, the post-op function
   would need to cast the strings to "char *", which seems ugly
   (although we do that in many places, and it's guaranteed to work).

 - _nm_utils_unescape_plain()/_nm_utils_unescape_spaces() are indeed
   already used together with nm_utils_strsplit_set_full(). However,
   that requires initializing the ch_lookup buffer twice. I would expect
   that initializing the ch_lookup buffer is a large portion of what
   the function does already (for short strings).
   This issue will be solved in the next commit by adding yet another flag
   which allows to unescape.

(cherry picked from commit 5b2b0dcadf)
This commit is contained in:
Thomas Haller 2019-04-11 13:32:43 +02:00
parent 53ab539dd1
commit c75a1d7e16
2 changed files with 66 additions and 17 deletions

View file

@ -1028,7 +1028,8 @@ nm_utils_strsplit_set_full (const char *str,
char *s;
guint8 ch_lookup[256];
const gboolean f_allow_escaping = NM_FLAGS_HAS (flags, NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING);
const gboolean f_preseve_empty = NM_FLAGS_HAS (flags, NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY);
const gboolean f_preserve_empty = NM_FLAGS_HAS (flags, NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY);
const gboolean f_strstrip = NM_FLAGS_HAS (flags, NM_UTILS_STRSPLIT_SET_FLAGS_STRSTRIP);
if (!str)
return NULL;
@ -1042,7 +1043,7 @@ nm_utils_strsplit_set_full (const char *str,
nm_assert ( !f_allow_escaping
|| !_char_lookup_has (ch_lookup, '\\'));
if (!f_preseve_empty) {
if (!f_preserve_empty) {
while (_char_lookup_has (ch_lookup, str[0]))
str++;
}
@ -1055,6 +1056,17 @@ nm_utils_strsplit_set_full (const char *str,
return NULL;
}
/* Evaluates to non-zero if the character at @str_cur is backslash-escaped.
 *
 * Walks backwards from @str_cur over the run of consecutive backslashes that
 * immediately precede it (never going before @str_start) and tracks the
 * parity of that run: an odd number of backslashes means the last one
 * escapes the character at @str_cur, an even number means the backslashes
 * pair up and escape each other.
 *
 * Both arguments are evaluated exactly once. */
#define _char_is_escaped(str_start, str_cur) \
    ({ \
        const char *const _esc_begin = (str_start); \
        const char *const _esc_pos = (str_cur); \
        const char *_esc_p; \
        int _esc_odd = 0; \
        \
        for (_esc_p = _esc_pos; \
             _esc_p > _esc_begin && _esc_p[-1] == '\\'; \
             _esc_p--) \
            _esc_odd = !_esc_odd; \
        _esc_odd; \
    })
num_tokens = 1;
c_str = str;
@ -1069,23 +1081,17 @@ nm_utils_strsplit_set_full (const char *str,
/* we assume escapings are not frequent. After we found
* this delimiter, check whether it was escaped by counting
* the backslashes before it. */
if (f_allow_escaping) {
const char *c2 = c_str;
while ( c2 > str
&& c2[-1] == '\\')
c2--;
if (((c_str - c2) % 2) != 0) {
/* the delimiter is escaped. This was not an accepted delimiter. */
c_str++;
continue;
}
if ( f_allow_escaping
&& _char_is_escaped (str, c_str)) {
/* the delimiter is escaped. This was not an accepted delimiter. */
c_str++;
continue;
}
c_str++;
/* if we drop empty tokens, then we now skip over all consecutive delimiters. */
if (!f_preseve_empty) {
if (!f_preserve_empty) {
while (_char_lookup_has (ch_lookup, c_str[0]))
c_str++;
if (c_str[0] == '\0')
@ -1115,10 +1121,10 @@ done1:
ptr[i_token++] = s;
if (s[0] == '\0') {
nm_assert (f_preseve_empty);
nm_assert (f_preserve_empty);
goto done2;
}
nm_assert ( f_preseve_empty
nm_assert ( f_preserve_empty
|| !_char_lookup_has (ch_lookup, s[0]));
while (!_char_lookup_has (ch_lookup, s[0])) {
@ -1138,7 +1144,7 @@ done1:
s[0] = '\0';
s++;
if (!f_preseve_empty) {
if (!f_preserve_empty) {
while (_char_lookup_has (ch_lookup, s[0]))
s++;
if (s[0] == '\0')
@ -1150,6 +1156,38 @@ done2:
nm_assert (i_token == num_tokens);
ptr[i_token] = NULL;
if (f_strstrip) {
gsize i;
i_token = 0;
for (i = 0; ptr[i]; i++) {
s = (char *) nm_str_skip_leading_spaces (ptr[i]);
if (s[0] != '\0') {
char *s_last;
s_last = &s[strlen (s) - 1];
while ( s_last > s
&& g_ascii_isspace (s_last[0])
&& ( ! f_allow_escaping
|| !_char_is_escaped (s, s_last)))
(s_last--)[0] = '\0';
}
if ( !f_preserve_empty
&& s[0] == '\0')
continue;
ptr[i_token++] = s;
}
if (i_token == 0) {
g_free (ptr);
return NULL;
}
ptr[i_token] = NULL;
}
return ptr;
}

View file

@ -336,6 +336,17 @@ typedef enum {
NM_UTILS_STRSPLIT_SET_FLAGS_NONE = 0,
NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY = (1u << 0),
NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING = (1u << 1),
/* If flag is set, does the same as g_strstrip() on the returned tokens.
* This will remove leading and trailing ascii whitespaces (g_ascii_isspace()
* and NM_ASCII_SPACES).
*
* - when combined with !%NM_UTILS_STRSPLIT_SET_FLAGS_PRESERVE_EMPTY,
* empty tokens will be removed (and %NULL will be returned if that
* results in an empty string array).
* - when combined with %NM_UTILS_STRSPLIT_SET_FLAGS_ALLOW_ESCAPING,
* trailing whitespace escaped by backslash is not stripped. */
NM_UTILS_STRSPLIT_SET_FLAGS_STRSTRIP = (1u << 2),
} NMUtilsStrsplitSetFlags;
const char **nm_utils_strsplit_set_full (const char *str,