#42 locate: add --ignore-spaces option to ignore word separators
Opened a year ago by marcotrevisan. Modified a year ago
marcotrevisan/mlocate ignore-separators  into  master

file modified
+1

@@ -1,1 +1,2 @@ 

  Miloslav Trmac <mitr@redhat.com>

+ Marco Trevisan <marco@ubuntu.com>

file modified
+9

@@ -39,6 +39,15 @@ 

  AM_GNU_GETTEXT([external], [need-ngettext])

  AM_GNU_GETTEXT_VERSION([0.18.2])

  

+ AC_ARG_ENABLE(iconv,

+     AC_HELP_STRING([--disable-iconv],

+                     [disable iconv support]),,

+     enable_iconv=yes)

+ 

+ if test x$enable_iconv = xyes; then

+     AM_ICONV

+ fi

+ 

  # Checks for header files.

  

  # Checks for types.

file modified
+10 -1

@@ -126,6 +126,14 @@ 

  Ignore case distinctions when matching patterns.

  

  .TP

+ \fB\-p\fR, \fB\-\-ignore\-spaces\fR

+ Ignore punctuation and spaces when matching patterns.

+ 

+ .TP

+ \fB\-t\fR, \fB\-\-transliterate\fR

+ Ignore accents using iconv transliteration when matching patterns.

+ 

+ .TP

  \fB\-l\fR, \fB\-\-limit\fR, \fB\-n\fR \fILIMIT\fR

  Exit successfully after finding

  .I LIMIT

@@ -267,4 +275,5 @@ 

  Miloslav Trmac <mitr@redhat.com>

  

  .SH SEE ALSO

- .BR updatedb (8)

+ .BR updatedb (8),

+ .BR iconv (1),

file modified
+344 -3

@@ -22,6 +22,10 @@ 

  #include <errno.h>

  #include <fcntl.h>

  #include <grp.h>

+ #if HAVE_ICONV

+ #include <iconv.h>

+ #include <langinfo.h>

+ #endif

  #include <inttypes.h>

  #include <limits.h>

  #include <locale.h>

@@ -47,6 +51,10 @@ 

  #include "db.h"

  #include "lib.h"

  

+ #define FNMATCH_CHARS "*?[\\]"

+ #define BASIC_REGEX_META_CHARS ".^$*[]\\-"

+ #define EXTENDED_REGEX_META_CHARS BASIC_REGEX_META_CHARS "{}|+?()"

+ 

  /* Check file existence before reporting them */

  static bool conf_check_existence; /* = false; */

  

@@ -60,6 +68,12 @@ 

  /* Ignore case when matching patterns */

  static bool conf_ignore_case; /* = false; */

  

+ /* Ignore accents when matching patterns */

+ static bool conf_transliterate; /* = false; */

+ 

+ /* Ignore punctuation and spaces when matching patterns */

+ static bool conf_ingore_separators; /* = false; */

+ 

  /* Return only files that match all patterns */

  static bool conf_match_all_patterns; /* = false; */

  

@@ -108,6 +122,11 @@ 

  /* Output only statistics */

  static bool conf_statistics; /* = false; */

  

+ #if HAVE_ICONV

+ /* Iconv context for transliterate conversion */

+ static iconv_t iconv_context; /* = NULL; */

+ #endif

+ 

   /* String utilities */

  

  /* Convert SRC to upper-case wide string in OBSTACK;

@@ -163,6 +182,242 @@ 

    return res;

  }

  

+ #if HAVE_ICONV

+ static bool

+ char_needs_escape (const char c)

+ {

+    if (conf_match_regexp_basic != false &&

+        strchr (BASIC_REGEX_META_CHARS, c) != NULL)

+      return true;

+ 

+    if (conf_match_regexp_basic != true &&

+        strchr (EXTENDED_REGEX_META_CHARS, c) != NULL)

+      return true;

+ 

+    return false;

+ }

+ 

+ static char *

+ escape_regex (const char *str, size_t len, size_t *escaped_len)

+ {

+   size_t i, j;

+   size_t newlen;

+   bool foundmeta;

+   char *outbuf;

+ 

+   if (escaped_len)

+ 	*escaped_len = 0;

+ 

+   if (conf_match_regexp != true)

+ 	return NULL;

+ 

+   foundmeta = false;

+   newlen = 0;

+ 

+   for (i = 0; str[i] && i < len; ++i)

+     {

+       if (char_needs_escape (str[i]))

+ 	{

+ 	  foundmeta = true;

+ 	  ++newlen;

+ 	}

+       ++newlen;

+     }

+ 

+   if (foundmeta != true || newlen == 0)

+     return NULL;

+ 

+   outbuf = xmalloc (newlen + 1);

+   outbuf[newlen] = '\0';

+ 

+   for (i = 0, j = 0; i < len && j < newlen; ++i)

+     {

+       if (char_needs_escape (str[i]))

+ 	outbuf[j++] = '\\';

+       outbuf[j++] = str[i];

+     }

+ 

+   if (escaped_len)

+     *escaped_len = newlen;

+ 

+   return outbuf;

+ }

+ 

+ /* Use iconv to transliterate the string into ASCII chars, when possible.

+    If a transliteration does not exist, we just use the actual symbol

+    so as not to lose precision. */

+ static char *

+ transliterate_string (const char *str)

+ {

+   size_t strrlen;

+   size_t inlen;

+   size_t outleft;

+   size_t transliteratedlen;

+   size_t nonasciibytes;

+   size_t i;

+   bool changed;

+   char *inbuf;

+   char *outbuf;

+   char *outptr;

+ 

+   changed = false;

+   nonasciibytes = 0;

+   strrlen = 0;

+ 

+   for (i = 0; str[i]; i++)

+     {

+       if (str[i] & 0x80)

+ 	++nonasciibytes;

+ 

+       ++strrlen;

+     }

+ 

+   if (nonasciibytes < 1)

+     return NULL;

+ 

+   inbuf = (char *) str;

+   inlen = 1;

+   transliteratedlen = 0;

+   outleft = strrlen + nonasciibytes;

+   outbuf = xmalloc (outleft);

+   outptr = outbuf;

+ 

+   while (inbuf + inlen <= str + strrlen)

+     {

+       size_t convertedlen;

+       size_t conversions;

+       size_t symbollen;

+       size_t outidx;

+ 

+       symbollen = inlen;

+       conversions = iconv (iconv_context, &inbuf, &inlen, &outptr, &outleft);

+       outidx = outptr - outbuf;

+       convertedlen = outidx - transliteratedlen;

+ 

+       if (conversions == (size_t) -1)

+ 	{

+ 	  if (errno == EILSEQ || errno == EINVAL)

+ 	    {

+ 	      inlen += 1;

+ 	      continue;

+ 	    }

+ 	  else if (errno == E2BIG)

+ 	    {

+ 	      outleft += 5;

+ 	      outbuf = xrealloc (outbuf, outidx + outleft);

+ 	      outptr = outbuf + outidx;

+ 	      continue;

+ 	    }

+ 	  error (0, errno, _("Impossible to transliterate string %s"), str);

+ 	  changed = false;

+ 	  break;

+ 	}

+       else if (conversions == 1 && convertedlen == 1 && outptr[-1] == '?')

+ 	{

+ 	  /* Transliteration is not possible for this symbol, so we just

+ 	     reuse it as it is. */

+ 	  memcpy (outptr - 1, inbuf - symbollen, symbollen);

+ 	  convertedlen = symbollen;

+ 	  outptr += symbollen - 1;

+ 	  outleft -= symbollen - 1;

+ 	}

+       else if (conversions > 0)

+ 	{

+ 	  if (conf_match_regexp != false && convertedlen > 0)

+ 	    {

+ 	      char *converted;

+ 	      char *escaped;

+ 	      size_t escaped_len;

+ 

+ 	      converted = outptr - convertedlen;

+ 	      escaped = escape_regex (converted, convertedlen, &escaped_len);

+ 

+ 	      if (escaped)

+ 		{

+ 		  if (escaped_len > outleft)

+ 		    {

+ 		      outleft += (escaped_len - outleft);

+ 		      outbuf = xrealloc (outbuf, outidx + outleft);

+ 		      outptr = outbuf + outidx;

+ 		      converted = outptr - convertedlen;

+ 		    }

+ 		  memcpy (converted, escaped, escaped_len);

+ 		  free (escaped);

+ 

+ 		  outptr += (escaped_len - convertedlen);

+ 		  outleft -= (escaped_len - convertedlen);

+ 		  convertedlen = escaped_len;

+ 		}

+ 	    }

+ 	  changed = true;

+ 	}

+       transliteratedlen += convertedlen;

+       inlen = 1;

+     }

+ 

+   if (changed != true)

+     {

+       free (outbuf);

+       return NULL;

+     }

+ 

+   outbuf[transliteratedlen] = '\0';

+   return outbuf;

+ }

+ #endif

+ 

+ /* Collapse each run of punctuation or whitespace characters in the string

+    into a single space. */

+ static char *

+ compress_string_separators (const char *str, bool is_pattern)

+ {

+   size_t strippedlen;

+   size_t i;

+   char *outbuf;

+   bool first;

+   bool changed;

+ 

+   changed = false;

+   first = false;

+   strippedlen = 0;

+   outbuf = xmalloc (strlen (str) + 1);

+ 

+     for (i = 0; str[i]; i++)

+       {

+ 	char cnt;

+ 

+ 	cnt = str[i];

+ 	if (isspace (cnt) || (ispunct (cnt) && cnt != '@' &&

+ 			      (!is_pattern || strchr (FNMATCH_CHARS, cnt) == NULL)))

+ 	  {

+ 	    if (first != false)

+ 	      {

+ 		changed = true;

+ 		continue;

+ 	      }

+ 	    if (cnt != ' ')

+ 	      {

+ 		cnt = ' ';

+ 		changed = true;

+ 	      }

+ 	    first = true;

+ 	  }

+ 	else

+ 	  first = false;

+ 

+ 	outbuf[strippedlen++] = cnt;

+       }

+ 

+     if (!changed)

+       {

+ 	free (outbuf);

+ 	return NULL;

+       }

+ 

+     outbuf[strippedlen] = '\0';

+     return outbuf;

+ }

+ 

  /* Write STRING to stdout, replace unprintable characters with '?' */

  static void

  write_quoted (const char *string)

@@ -432,7 +687,32 @@ 

    else

      matching = path;

    if (!string_matches_pattern (matching))

-     goto done;

+     {

+       char *altered_matching;

+       bool matched;

+ 

+       altered_matching = NULL;

+       matched = false;

+       if (conf_ingore_separators != false)

+ 	altered_matching = compress_string_separators (matching, false);

+ #if HAVE_ICONV

+       if (conf_transliterate != false)

+ 	{

+ 	  char *old_altered = altered_matching;

+ 	  if (altered_matching)

+ 	    matching = altered_matching;

+ 	  altered_matching = transliterate_string (matching);

+ 	  free (old_altered);

+ 	}

+ #endif

+       if (altered_matching != NULL)

+ 	{

+ 	  matched = string_matches_pattern (altered_matching);

+ 	  free (altered_matching);

+ 	}

+       if (!matched)

+ 	goto done;

+     }

    /* Visible? */

    if (*visible == -1)

      *visible = check_directory_perms (path) == 0;

@@ -632,6 +912,13 @@ 

  	    "  -h, --help             print this help\n"

  	    "  -i, --ignore-case      ignore case distinctions when matching "

  	    "patterns\n"

+ 	    "  -p, --ignore-spaces    ignore punctuation and spaces when "

+ 	    "matching patterns\n"

+ #if HAVE_ICONV

+ 	    "  -t, --transliterate    ignore accents using iconv "

+ 	    "transliteration when\n"

+ 	    "                         matching patterns\n"

+ #endif

  	    "  -l, --limit, -n LIMIT  limit output (or counting) to LIMIT "

  	    "entries\n"

  	    "  -m, --mmap             ignored, for backward compatibility\n"

@@ -669,6 +956,8 @@ 

        { "follow", no_argument, NULL, 'L' },

        { "help", no_argument, NULL, 'h' },

        { "ignore-case", no_argument, NULL, 'i' },

+       { "ignore-spaces", no_argument, NULL, 'p' },

+       { "transliterate", no_argument, NULL, 't' },

        { "limit", required_argument, NULL, 'l' },

        { "mmap", no_argument, NULL, 'm' },

        { "quiet", no_argument, NULL, 'q' },

@@ -691,7 +980,7 @@ 

      {

        int opt, idx;

  

-       opt = getopt_long (argc, argv, "0AHPLSVbcd:ehil:mn:qr:sw", options, &idx);

+       opt = getopt_long (argc, argv, "0AHPLSVbcd:ehitpl:mn:qr:sw", options, &idx);

        switch (opt)

  	{

  	case -1:

@@ -772,6 +1061,14 @@ 

  	  conf_ignore_case = true;

  	  break;

  

+ 	case 't':

+ 	  conf_transliterate = true;

+ 	  break;

+ 

+ 	case 'p':

+ 	  conf_ingore_separators = true;

+ 	  break;

+ 

  	case 'l': case 'n':

  	  {

  	    char *end;

@@ -822,6 +1119,22 @@ 

      error (EXIT_FAILURE, 0,

  	   _("non-option arguments are not allowed with --%s"),

  	   conf_statistics != false ? "statistics" : "regexp");

+   if (conf_ingore_separators != false && conf_match_regexp != false)

+     error (EXIT_FAILURE, 0,

+ 	   _("ignore-spaces is not supported when using regexp"));

+   if (conf_transliterate != false)

+     {

+ #if HAVE_ICONV

+       iconv_context = iconv_open ("ASCII//TRANSLIT", nl_langinfo (CODESET));

+       if (iconv_context == (iconv_t) -1)

+ 	  error (EXIT_FAILURE, errno, _("can not do transliteration between " \

+ 					"these locales: `%s' and `ASCII'"),

+ 					nl_langinfo (CODESET));

+ #else

+       error (EXIT_FAILURE, errno, _("transliteration support is not supported" \

+ 				    "by this build of %s"), program_name);

+ #endif

+     }

  }

  

  /* Parse arguments in ARGC, ARGV.  Exit on error. */

@@ -834,6 +1147,30 @@ 

      string_list_append (&conf_patterns, argv[i]);

    if (conf_statistics == false && conf_patterns.len == 0)

      error (EXIT_FAILURE, 0, _("no pattern to search for specified"));

+   if (conf_transliterate != false || conf_ingore_separators != false)

+     {

+       char *altered_pattern;

+       size_t patterns_len = conf_patterns.len;

+ 

+       for (i = 0; i < patterns_len; i++)

+ 	{

+ 	  if (conf_ingore_separators != false)

+ 	    {

+ 	      altered_pattern =

+ 		compress_string_separators (conf_patterns.entries[i], true);

+ 	      if (altered_pattern)

+ 		conf_patterns.entries[i] = altered_pattern;

+ 	    }

+ #if HAVE_ICONV

+ 	  if (conf_transliterate != false)

+ 	    {

+ 	      altered_pattern = transliterate_string (conf_patterns.entries[i]);

+ 	      if (altered_pattern)

+ 		string_list_append (&conf_patterns, altered_pattern);

+ 	    }

+ #endif

+ 	}

+     }

    conf_patterns.entries = xnrealloc (conf_patterns.entries, conf_patterns.len,

  				     sizeof (*conf_patterns.entries));

    if (conf_match_regexp != false)

@@ -871,7 +1208,7 @@ 

        for (i = 0; i < conf_patterns.len; i++)

  	{

  	  conf_patterns_simple[i] = strpbrk (conf_patterns.entries[i],

- 					     "*?[\\]") == NULL;

+ 					     FNMATCH_CHARS) == NULL;

  	  if (conf_patterns_simple[i] != false)

  	    conf_have_simple_pattern = true;

  	}

@@ -1042,6 +1379,10 @@ 

        handle_dbpath_entry (conf_dbpath.entries[i]);

      }

   done:

+ #if HAVE_ICONV

+   if (conf_transliterate != false && iconv_context)

+     iconv_close (iconv_context);

+ #endif

    if (conf_output_count != false)

      printf ("%ju\n", matches_found);

    if (conf_statistics != false || matches_found != 0)

file modified
+3

@@ -233,6 +233,9 @@ 

                           existence (default)

    -h, --help             print this help

    -i, --ignore-case      ignore case distinctions when matching patterns

+   -p, --ignore-spaces    ignore punctuation and spaces when matching patterns

+   -t, --transliterate    ignore accents using iconv transliteration when

+                          matching patterns

    -l, --limit, -n LIMIT  limit output (or counting) to LIMIT entries

    -m, --mmap             ignored, for backward compatibility

    -P, --nofollow, -H     don't follow trailing symbolic links when checking file

It lets locate find files while ignoring in-word separators, such as punctuation and spaces, so searching for "foo-bar" is like searching for "foo bar", "foo_bar" or "foo?bar".

The --ignore-spaces option is not compatible with regexp matching; combining the two is rejected with an error.

This PR has https://pagure.io/mlocate/pull-request/41 as a prerequisite.

11 new commits added

  • locate: add --ignore-spaces option to ignore word separators
  • locate: escape transliterated meta chars if regex is enabled
  • locate: allocate less space for transliterated, realloc if needed
  • locate: only allocate memory if needed in transliteration
  • locate: take in account inlen value when transliterating string
  • locate: don't even try to transliterate ascii strings
  • locate: transliterate strings by char and do it on valid results only
  • locate: ignore transliterated strings only with replacement chars
  • locate: only add and check transliterated patterns if needed
  • doc: add -t / --transliterate to man page
  • locate: add --transliterate support using iconv to match accented
a year ago

12 new commits added

  • locate: add --ignore-spaces option to ignore word separators
  • locate: update test to match new help
  • locate: escape transliterated meta chars if regex is enabled
  • locate: allocate less space for transliterated, realloc if needed
  • locate: only allocate memory if needed in transliteration
  • locate: take in account inlen value when transliterating string
  • locate: don't even try to transliterate ascii strings
  • locate: transliterate strings by char and do it on valid results only
  • locate: ignore transliterated strings only with replacement chars
  • locate: only add and check transliterated patterns if needed
  • doc: add -t / --transliterate to man page
  • locate: add --transliterate support using iconv to match accented
a year ago

12 new commits added

  • locate: add --ignore-spaces option to ignore word separators
  • locate: update test to match new help
  • locate: escape transliterated meta chars if regex is enabled
  • locate: allocate less space for transliterated, realloc if needed
  • locate: only allocate memory if needed in transliteration
  • locate: take in account inlen value when transliterating string
  • locate: don't even try to transliterate ascii strings
  • locate: transliterate strings by char and do it on valid results only
  • locate: ignore transliterated strings only with replacement chars
  • locate: only add and check transliterated patterns if needed
  • doc: add -t / --transliterate to man page
  • locate: add --transliterate support using iconv to match accented
a year ago