From 467718330ad4d6a406161932457e4f4dc71962c2 Mon Sep 17 00:00:00 2001
From: Ralph Bean <rbean@redhat.com>
Date: May 23 2016 12:12:46 +0000
Subject: Messing around with the distance module.  Too slow!


---

diff --git a/muster.py b/muster.py
index babf845..e41c321 100755
--- a/muster.py
+++ b/muster.py
@@ -1,8 +1,31 @@
 #!/usr/bin/env python3
+# This file is part of muster.
+# Copyright (C) 2016 Red Hat, Inc.
+#
+# muster is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# muster is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with muster; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# Authors:  Ralph Bean <rbean@redhat.com>
+#
 
+import chardet
+import distance
 import pygments.lexers
 
+import os
 import sys
+import time
 
 def extract_comments(filename):
     with open(filename, 'rb') as f:
@@ -11,19 +34,63 @@ def extract_comments(filename):
     content = content.decode('utf-8')
     lexer = pygments.lexers.guess_lexer_for_filename(filename, content)
     tokens = lexer.get_tokens_unprocessed(content)
+
+    retval = []
     for idx, kind, value in tokens:
+        if 'Text' in tuple(kind) and value == u'\n':
+            continue
         if 'Comment' in tuple(kind):
-            yield value
+            retval += [value.strip()]
+        if 'Comment' not in tuple(kind) and retval:
+            yield "\n".join(retval)
+            retval = []
 
+    if retval:
+        yield "\n".join(retval)
 
+def load_licenses(directory):
+    """Yield (name, text) for each ``*.txt`` license file in *directory*."""
+    for filename in os.listdir(directory):
+        if filename.endswith('.txt'):
+            name = filename[:-len('.txt')]  # strip('.txt') eats chars, not a suffix
+            with open(os.path.join(directory, filename), 'rb') as f:
+                data = f.read()
+            yield name, data.decode(chardet.detect(data)['encoding'] or 'utf-8')
 
 def main(args):
+    """Print the ten lowest-distance (score, name) license matches."""
     filename = args[-1]  # TODO - do this nicely.
+    license_path = os.path.expanduser('~/scratch/oslc-3.0-4.src/licenses/')
+    print("Loading licenses...")
+    licenses = dict(load_licenses(license_path))
+    print("Done loading licenses... (%i found)" % len(licenses))
 
     print("Processing %s" % filename)
     comments = extract_comments(filename)
+    results = []
     for comment in comments:
-        print(comment)
+        size = len(comment)
+        for name, fulltext in licenses.items():
+            # First, an optimization.  If the two things we're going to compare
+            # are sufficiently different in size, then just bail.  There's no
+            # way they're the same thing.
+            N = len(fulltext)
+            if abs(size - N) > min([size, N]):
+                # DEBUG skipping
+                continue
+
+            # Floor-divide so the cutoff stays an int; a negative score is skipped.
+            cutoff = min([size, N]) // 3
+            score = distance.levenshtein(comment, fulltext, max_dist=cutoff)
+            if score < 0:
+                continue
+            print("%r on %r gives \t%r\t%r\t%r" % (comment[:20], name, score, N, size))
+            results.append((score, name,))
+
+    results.sort(key=lambda result: result[0])  # smallest distance first
+
+    for result in results[:10]:
+        print(result)
 
 if __name__ == '__main__':
     main(sys.argv)