From 467718330ad4d6a406161932457e4f4dc71962c2 Mon Sep 17 00:00:00 2001
From: Ralph Bean
Date: May 23 2016 12:12:46 +0000
Subject: Messing around with the distance module. Too slow\!

---

diff --git a/muster.py b/muster.py
index babf845..e41c321 100755
--- a/muster.py
+++ b/muster.py
@@ -1,8 +1,31 @@
 #!/usr/bin/env python3
+# This file is part of muster.
+# Copyright (C) 2016 Red Hat, Inc.
+#
+# muster is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# muster is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with muster; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+#
+# Authors: Ralph Bean
+#
 
+import chardet
+import distance
 import pygments.lexers
 
+import os
 import sys
+import time
 
 def extract_comments(filename):
     with open(filename, 'rb') as f:
@@ -11,19 +34,74 @@ def extract_comments(filename):
         content = content.decode('utf-8')
     lexer = pygments.lexers.guess_lexer_for_filename(filename, content)
     tokens = lexer.get_tokens_unprocessed(content)
+
+    retval = []
     for idx, kind, value in tokens:
+        if 'Text' in tuple(kind) and value == u'\n':
+            continue
         if 'Comment' in tuple(kind):
-            yield value
+            retval += [value.strip()]
+        if 'Comment' not in tuple(kind) and retval:
+            yield "\n".join(retval)
+            retval = []
+    if retval:
+        yield "\n".join(retval)
 
 
+def load_licenses(directory):
+    """Yield (name, text) for every ``*.txt`` license file in directory.
+
+    The name is the filename without its ``.txt`` extension and the text is
+    the file content decoded with the encoding guessed by chardet.
+    """
+    for filename in os.listdir(directory):
+        if filename.endswith('.txt'):
+            # Not str.strip('.txt'): that removes any of '.', 't', 'x' from
+            # both ends and would mangle names such as 'text.txt'.
+            name = filename[:-len('.txt')]
+            with open(os.path.join(directory, filename), 'rb') as f:
+                data = f.read()
+            # chardet reports None when it cannot guess; fall back to UTF-8.
+            encoding = chardet.detect(data)['encoding'] or 'utf-8'
+            yield name, data.decode(encoding, errors='replace')
+
+
 def main(args):
     filename = args[-1]  # TODO - do this nicely.
+    license_path = os.path.expanduser('~/scratch/oslc-3.0-4.src/licenses/')
+    print("Loading licenses...")
+    licenses = dict(load_licenses(license_path))
+    print("Done loading licenses... (%i found)" % len(licenses))
 
     print("Processing %s" % filename)
     comments = extract_comments(filename)
+    results = []
     for comment in comments:
-        print(comment)
+        size = len(comment)
+        for name, fulltext in licenses.items():
+            # First, an optimization. If the two things we're going to compare
+            # are sufficiently different in size, then just bail. There's no
+            # way they're the same thing.
+            N = len(fulltext)
+            if abs(size - N) > min(size, N):
+                # DEBUG skipping
+                continue
+
+            # max_dist is an edit count, so keep it an integer; a negative
+            # score below means the distance exceeded this cutoff.
+            cutoff = min(size, N) // 3
+            score = distance.levenshtein(comment, fulltext, max_dist=cutoff)
+            if score < 0:
+                continue
+            print("%r on %r gives \t%r\t%r\t%r" % (comment[:20], name, score, N, size))
+            results.append((score, name,))
+
+    # Best (lowest-distance) matches first.
+    results.sort(key=lambda result: result[0])
+
+    for result in results[:10]:
+        print(result)
 
 
 if __name__ == '__main__':
     main(sys.argv)