#41 locate: add --transliterate support using iconv to match accented
Opened a year ago by marcotrevisan. Modified a year ago
marcotrevisan/mlocate master  into  master

file modified
+1

@@ -1,1 +1,2 @@ 

  Miloslav Trmac <mitr@redhat.com>

+ Marco Trevisan <marco@ubuntu.com>

file modified
+9

@@ -39,6 +39,15 @@ 

  AM_GNU_GETTEXT([external], [need-ngettext])

  AM_GNU_GETTEXT_VERSION([0.18.2])

  

+ AC_ARG_ENABLE(iconv,

+     AC_HELP_STRING([--disable-iconv],

+                     [disable iconv support]),,

+     enable_iconv=yes)

+ 

+ if test x$enable_iconv = xyes; then

+     AM_ICONV

+ fi

+ 

  # Checks for header files.

  

  # Checks for types.

file modified
+6 -1

@@ -126,6 +126,10 @@ 

  Ignore case distinctions when matching patterns.

  

  .TP

+ \fB\-t\fR, \fB\-\-transliterate\fR

+ Ignore accents using iconv transliteration when matching patterns.

+ 

+ .TP

  \fB\-l\fR, \fB\-\-limit\fR, \fB\-n\fR \fILIMIT\fR

  Exit successfully after finding

  .I LIMIT

@@ -267,4 +271,5 @@ 

  Miloslav Trmac <mitr@redhat.com>

  

  .SH SEE ALSO

- .BR updatedb (8)

+ .BR updatedb (8),

+ .BR iconv (1)

file modified
+263 -1

@@ -22,6 +22,10 @@ 

  #include <errno.h>

  #include <fcntl.h>

  #include <grp.h>

+ #if HAVE_ICONV

+ #include <iconv.h>

+ #include <langinfo.h>

+ #endif

  #include <inttypes.h>

  #include <limits.h>

  #include <locale.h>

@@ -47,6 +51,9 @@ 

  #include "db.h"

  #include "lib.h"

  

+ #define BASIC_REGEX_META_CHARS ".^$*[]\\-"

+ #define EXTENDED_REGEX_META_CHARS BASIC_REGEX_META_CHARS "{}|+?()"

+ 

  /* Check file existence before reporting them */

  static bool conf_check_existence; /* = false; */

  

@@ -60,6 +67,9 @@ 

  /* Ignore case when matching patterns */

  static bool conf_ignore_case; /* = false; */

  

+ /* Ignore accents when matching patterns */

+ static bool conf_transliterate; /* = false; */

+ 

  /* Return only files that match all patterns */

  static bool conf_match_all_patterns; /* = false; */

  

@@ -108,6 +118,11 @@ 

  /* Output only statistics */

  static bool conf_statistics; /* = false; */

  

+ #if HAVE_ICONV

+ /* Iconv context for transliterate conversion */

+ static iconv_t iconv_context; /* = NULL; */

+ #endif

+ 

   /* String utilities */

  

  /* Convert SRC to upper-case wide string in OBSTACK;

@@ -163,6 +178,190 @@ 

    return res;

  }

  

+ #if HAVE_ICONV

+ /* Return true if C is a metacharacter of the regex dialect currently in
+    use (the basic set when conf_match_regexp_basic is set, the extended set
+    otherwise), i.e. a character that has to be preceded by a backslash in
+    order to be matched literally. */
+ static bool
+ char_needs_escape (const char c)
+ {
+   const char *meta_chars;
+ 
+   /* strchr () also matches the terminating NUL of the searched set, so a
+      NUL byte has to be rejected explicitly: it is never a metacharacter. */
+   if (c == '\0')
+     return false;
+ 
+   meta_chars = (conf_match_regexp_basic != false
+ 		? BASIC_REGEX_META_CHARS : EXTENDED_REGEX_META_CHARS);
+   return strchr (meta_chars, c) != NULL;
+ }

+ 

+ /* Return a newly allocated, NUL-terminated copy of the first LEN bytes of
+    STR (stopping early at a NUL byte) in which every regex metacharacter is
+    preceded by a backslash.  If ESCAPED_LEN is not NULL, the length of the
+    result is stored there (0 when NULL is returned).
+    Return NULL if regex matching is disabled or if nothing needs escaping.
+    The caller owns the returned buffer and must free () it. */
+ static char *
+ escape_regex (const char *str, size_t len, size_t *escaped_len)
+ {
+   size_t i, j;
+   size_t srclen;
+   size_t newlen;
+   char *outbuf;
+ 
+   if (escaped_len)
+     *escaped_len = 0;
+ 
+   if (conf_match_regexp != true)
+     return NULL;
+ 
+   /* Measure the input and the escaped output in one pass.  The bound is
+      checked before the byte is read because STR is not required to be
+      NUL-terminated within LEN bytes. */
+   newlen = 0;
+   for (srclen = 0; srclen < len && str[srclen] != '\0'; ++srclen)
+     newlen += char_needs_escape (str[srclen]) ? 2 : 1;
+ 
+   /* Same length means no metacharacter was found (this also covers the
+      empty input): there is nothing to escape. */
+   if (newlen == srclen)
+     return NULL;
+ 
+   outbuf = xmalloc (newlen + 1);
+   for (i = 0, j = 0; i < srclen; ++i)
+     {
+       if (char_needs_escape (str[i]))
+ 	outbuf[j++] = '\\';
+       outbuf[j++] = str[i];
+     }
+   outbuf[j] = '\0';
+ 
+   if (escaped_len)
+     *escaped_len = newlen;
+ 
+   return outbuf;
+ }

+ 

+ /* Make sure at least NEEDED bytes, plus one byte for the terminating NUL,
+    can be written at *OUTPTR.  If *OUTLEFT is too small, *OUTBUF is grown
+    and *OUTPTR / *OUTLEFT are updated to refer to the (possibly moved)
+    buffer.  Invariant kept by transliterate_string (): the allocation is
+    always (*OUTPTR - *OUTBUF) + *OUTLEFT + 1 bytes long. */
+ static void
+ transliterate_reserve (char **outbuf, char **outptr, size_t *outleft,
+ 		       size_t needed)
+ {
+   size_t used;
+ 
+   if (needed <= *outleft)
+     return;
+ 
+   used = *outptr - *outbuf;
+   *outbuf = xrealloc (*outbuf, used + needed + 1);
+   *outptr = *outbuf + used;
+   *outleft = needed;
+ }
+ 
+ /* Use iconv to transliterate STR into ASCII chars, when possible.
+    If a transliteration does not exist for a symbol, the original bytes of
+    that symbol are kept in order not to lose precision.  When regex matching
+    is enabled, metacharacters produced by a transliteration are escaped.
+    Return a newly allocated NUL-terminated string that the caller must
+    free (), or NULL if STR contains no byte with the high bit set, if no
+    symbol could actually be transliterated, or if the conversion failed. */
+ static char *
+ transliterate_string (const char *str)
+ {
+   /* Extra output room requested each time iconv () reports E2BIG. */
+   enum { OUTBUF_GROW_STEP = 5 };
+ 
+   size_t strrlen;
+   size_t inlen;
+   size_t outleft;
+   size_t transliteratedlen;
+   size_t nonasciibytes;
+   size_t i;
+   bool changed;
+   char *inbuf;
+   char *outbuf;
+   char *outptr;
+ 
+   changed = false;
+   nonasciibytes = 0;
+   strrlen = 0;
+ 
+   for (i = 0; str[i]; i++)
+     {
+       if (str[i] & 0x80)
+ 	++nonasciibytes;
+ 
+       ++strrlen;
+     }
+ 
+   /* Pure ASCII: nothing to transliterate. */
+   if (nonasciibytes < 1)
+     return NULL;
+ 
+   /* iconv () takes a non-const input pointer but does not write to it. */
+   inbuf = (char *) str;
+   inlen = 1;
+   transliteratedlen = 0;
+   outleft = strrlen + nonasciibytes;
+   /* One extra byte is always reserved for the terminating NUL. */
+   outbuf = xmalloc (outleft + 1);
+   outptr = outbuf;
+ 
+   /* Feed iconv () one symbol at a time: start with a single byte and widen
+      the window until the bytes form a complete, valid sequence.
+      NOTE: bytes that never form a valid sequence before the end of STR are
+      not copied to the output; the loop simply ends. */
+   while (inbuf + inlen <= str + strrlen)
+     {
+       size_t convertedlen;
+       size_t conversions;
+       size_t symbollen;
+       size_t outidx;
+ 
+       symbollen = inlen;
+       conversions = iconv (iconv_context, &inbuf, &inlen, &outptr, &outleft);
+       outidx = outptr - outbuf;
+       convertedlen = outidx - transliteratedlen;
+ 
+       if (conversions == (size_t) -1)
+ 	{
+ 	  if (errno == EILSEQ || errno == EINVAL)
+ 	    {
+ 	      /* Incomplete or invalid so far: retry with one more byte. */
+ 	      inlen += 1;
+ 	      continue;
+ 	    }
+ 	  else if (errno == E2BIG)
+ 	    {
+ 	      /* Output buffer too small: enlarge it and retry. */
+ 	      transliterate_reserve (&outbuf, &outptr, &outleft,
+ 				     outleft + OUTBUF_GROW_STEP);
+ 	      continue;
+ 	    }
+ 	  error (0, errno, _("Impossible to transliterate string %s"), str);
+ 	  changed = false;
+ 	  break;
+ 	}
+       else if (conversions == 1 && convertedlen == 1 && outptr[-1] == '?')
+ 	{
+ 	  /* Transliteration is not possible for this symbol, so we just
+ 	     reuse it as it is: the '?' placeholder is overwritten by the
+ 	     SYMBOLLEN original bytes, which needs SYMBOLLEN - 1 more bytes
+ 	     of output room. */
+ 	  size_t extra;
+ 
+ 	  extra = symbollen - 1;
+ 	  transliterate_reserve (&outbuf, &outptr, &outleft, extra);
+ 	  memcpy (outptr - 1, inbuf - symbollen, symbollen);
+ 	  convertedlen = symbollen;
+ 	  outptr += extra;
+ 	  outleft -= extra;
+ 	}
+       else if (conversions > 0)
+ 	{
+ 	  if (conf_match_regexp != false && convertedlen > 0)
+ 	    {
+ 	      char *escaped;
+ 	      size_t escaped_len;
+ 
+ 	      escaped = escape_regex (outptr - convertedlen, convertedlen,
+ 				      &escaped_len);
+ 	      if (escaped)
+ 		{
+ 		  /* The escaped text replaces the converted text in place
+ 		     and is longer by the number of added backslashes. */
+ 		  size_t extra;
+ 
+ 		  extra = escaped_len - convertedlen;
+ 		  transliterate_reserve (&outbuf, &outptr, &outleft, extra);
+ 		  memcpy (outptr - convertedlen, escaped, escaped_len);
+ 		  free (escaped);
+ 
+ 		  outptr += extra;
+ 		  outleft -= extra;
+ 		  convertedlen = escaped_len;
+ 		}
+ 	    }
+ 	  changed = true;
+ 	}
+       transliteratedlen += convertedlen;
+       inlen = 1;
+     }
+ 
+   /* Nothing was really transliterated (or an error occurred): the result
+      would not add anything to the original string. */
+   if (changed != true)
+     {
+       free (outbuf);
+       return NULL;
+     }
+ 
+   outbuf[transliteratedlen] = '\0';
+   return outbuf;
+ }

+ #endif

+ 

  /* Write STRING to stdout, replace unprintable characters with '?' */

  static void

  write_quoted (const char *string)

@@ -432,7 +631,28 @@ 

    else

      matching = path;

    if (!string_matches_pattern (matching))

+ #if !HAVE_ICONV

      goto done;

+ #else

+     {

+       bool matched;

+ 

+       matched = false;

+       if (conf_transliterate != false)

+ 	{

+ 	  char *transliterated;

+ 

+ 	  transliterated = transliterate_string (matching);

+ 	  if (transliterated)

+ 	    {

+ 	      matched = string_matches_pattern (transliterated);

+ 	      free (transliterated);

+ 	    }

+ 	}

+       if (!matched)

+ 	goto done;

+     }

+ #endif

    /* Visible? */

    if (*visible == -1)

      *visible = check_directory_perms (path) == 0;

@@ -632,6 +852,11 @@ 

  	    "  -h, --help             print this help\n"

  	    "  -i, --ignore-case      ignore case distinctions when matching "

  	    "patterns\n"

+ #if HAVE_ICONV

+ 	    "  -t, --transliterate    ignore accents using iconv "

+ 	    "transliteration when\n"

+ 	    "                         matching patterns\n"

+ #endif

  	    "  -l, --limit, -n LIMIT  limit output (or counting) to LIMIT "

  	    "entries\n"

  	    "  -m, --mmap             ignored, for backward compatibility\n"

@@ -669,6 +894,7 @@ 

        { "follow", no_argument, NULL, 'L' },

        { "help", no_argument, NULL, 'h' },

        { "ignore-case", no_argument, NULL, 'i' },

+       { "transliterate", no_argument, NULL, 't' },

        { "limit", required_argument, NULL, 'l' },

        { "mmap", no_argument, NULL, 'm' },

        { "quiet", no_argument, NULL, 'q' },

@@ -691,7 +917,7 @@ 

      {

        int opt, idx;

  

-       opt = getopt_long (argc, argv, "0AHPLSVbcd:ehil:mn:qr:sw", options, &idx);

+       opt = getopt_long (argc, argv, "0AHPLSVbcd:ehitl:mn:qr:sw", options, &idx);

        switch (opt)

  	{

  	case -1:

@@ -772,6 +998,10 @@ 

  	  conf_ignore_case = true;

  	  break;

  

+ 	case 't':

+ 	  conf_transliterate = true;

+ 	  break;

+ 

  	case 'l': case 'n':

  	  {

  	    char *end;

@@ -822,6 +1052,19 @@ 

      error (EXIT_FAILURE, 0,

  	   _("non-option arguments are not allowed with --%s"),

  	   conf_statistics != false ? "statistics" : "regexp");

+   if (conf_transliterate != false)

+     {

+ #if HAVE_ICONV

+       iconv_context = iconv_open ("ASCII//TRANSLIT", nl_langinfo (CODESET));

+       if (iconv_context == (iconv_t) -1)

+ 	  error (EXIT_FAILURE, errno, _("can not do transliteration between " \

+ 					"these locales: `%s' and `ASCII'"),

+ 					nl_langinfo (CODESET));

+ #else

+       error (EXIT_FAILURE, errno, _("transliteration support is not supported" \

+ 				    "by this build of %s"), program_name);

+ #endif

+     }

  }

  

  /* Parse arguments in ARGC, ARGV.  Exit on error. */

@@ -834,6 +1077,21 @@ 

      string_list_append (&conf_patterns, argv[i]);

    if (conf_statistics == false && conf_patterns.len == 0)

      error (EXIT_FAILURE, 0, _("no pattern to search for specified"));

+ #if HAVE_ICONV

+   if (conf_transliterate != false)

+     {

+       size_t patterns_len = conf_patterns.len;

+       char *transliterated;

+ 

+       for (i = 0; i < patterns_len; i++)

+ 	{

+ 	  transliterated = transliterate_string (conf_patterns.entries[i]);

+ 

+ 	  if (transliterated)

+ 	    string_list_append (&conf_patterns, transliterated);

+ 	}

+     }

+ #endif

    conf_patterns.entries = xnrealloc (conf_patterns.entries, conf_patterns.len,

  				     sizeof (*conf_patterns.entries));

    if (conf_match_regexp != false)

@@ -1042,6 +1300,10 @@ 

        handle_dbpath_entry (conf_dbpath.entries[i]);

      }

   done:

+ #if HAVE_ICONV

+   if (conf_transliterate != false && iconv_context)

+     iconv_close (iconv_context);

+ #endif

    if (conf_output_count != false)

      printf ("%ju\n", matches_found);

    if (conf_statistics != false || matches_found != 0)

file modified
+2

@@ -233,6 +233,8 @@ 

                           existence (default)

    -h, --help             print this help

    -i, --ignore-case      ignore case distinctions when matching patterns

+   -t, --transliterate    ignore accents using iconv transliteration when

+                          matching patterns

    -l, --limit, -n LIMIT  limit output (or counting) to LIMIT entries

    -m, --mmap             ignored, for backward compatibility

    -P, --nofollow, -H     don't follow trailing symbolic links when checking file

When enabled, all search patterns and paths are transliterated and used for matching.

Fixes #40

3 new commits added

  • locate: don't even try to transliterate ascii strings
  • locate: transliterate strings by char and do it on valid results only
  • locate: ignore transliterated strings only with replacement chars
a year ago

rebased onto f96e90e

a year ago

6 new commits added

  • locate: don't even try to transliterate ascii strings
  • locate: transliterate strings by char and do it on valid results only
  • locate: ignore transliterated strings only with replacement chars
  • locate: only add and check transliterated patterns if needed
  • doc: add -t / --transliterate to man page
  • locate: add --transliterate support using iconv to match accented
a year ago

3 new commits added

  • locate: allocate less space for transliterated, realloc if needed
  • locate: only allocate memory if needed in transliteration
  • locate: take in account inlen value when transliterating string
a year ago

10 new commits added

  • locate: escape transliterated meta chars if regex is enabled
  • locate: allocate less space for transliterated, realloc if needed
  • locate: only allocate memory if needed in transliteration
  • locate: take in account inlen value when transliterating string
  • locate: don't even try to transliterate ascii strings
  • locate: transliterate strings by char and do it on valid results only
  • locate: ignore transliterated strings only with replacement chars
  • locate: only add and check transliterated patterns if needed
  • doc: add -t / --transliterate to man page
  • locate: add --transliterate support using iconv to match accented
a year ago

1 new commit added

  • locate: update test to match new help
a year ago