#112 Quick and dirty license and doc files detection
Merged 2 years ago by zbyszek. Opened 3 years ago by eclipseo.
fedora-rust/ eclipseo/rust2rpm license_detection  into  master

file modified
+55 -12
@@ -5,6 +5,7 @@ 

  import difflib

  import itertools

  import os

+ import re

  import shlex

  import shutil

  import sys
@@ -32,6 +33,15 @@ 

      extensions=["jinja2.ext.do"],

      trim_blocks=True,

      lstrip_blocks=True)

+ LICENSES = re.compile(

+          r"(COPYING|COPYING[\.\-].*|COPYRIGHT|COPYRIGHT[\.\-].*|"

+          r"EULA|EULA[\.\-].*|licen[cs]e|licen[cs]e.*|LICEN[CS]E|"

+          r"LICEN[CS]E[\.\-].*|.*[\.\-]LICEN[CS]E.*|NOTICE|NOTICE[\.\-].*|"

+          r"PATENTS|PATENTS[\.\-].*|UNLICEN[CS]E|UNLICEN[CS]E[\.\-].*|"

+          r"agpl[\.\-].*|gpl[\.\-].*|lgpl[\.\-].*|AGPL-.*[0-9].*|"

+          r"APACHE-.*[0-9].*|BSD-.*[0-9].*|CC-BY-.*|GFDL-.*[0-9].*|"

+          r"GNU-.*[0-9].*|GPL-.*[0-9].*|LGPL-.*[0-9].*|MIT-.*[0-9].*|"

+          r"MPL-.*[0-9].*|OFL-.*[0-9].*)")

  

  def get_default_target():

      try:
@@ -111,9 +121,11 @@ 

  

  def local_toml(toml, version):

      if os.path.isdir(toml):

+         doc_files = get_doc_files(toml)

+         license_files = get_license_files(toml)

          toml = os.path.join(toml, "Cargo.toml")

  

-     return toml, None, version

+     return toml, None, version, doc_files, license_files

  

  def local_crate(crate, version):

      cratename, version = os.path.basename(crate)[:-6].rsplit("-", 1)
@@ -143,7 +155,7 @@ 

      return cratef, crate, version

  

  @contextlib.contextmanager

- def toml_from_crate(cratef, crate, version):

+ def files_from_crate(cratef, crate, version):

      with tempfile.TemporaryDirectory() as tmpdir:

          target_dir = f"{tmpdir}/"

          with tarfile.open(cratef, "r") as archive:
@@ -155,7 +167,10 @@ 

          toml = f"{tmpdir}/{toml_relpath}"

          if not os.path.isfile(toml):

              raise IOError("crate does not contain Cargo.toml file")

-         yield toml

+         root_path = f"{tmpdir}/{crate}-{version}"

+         doc_files = get_doc_files(root_path)

+         license_files = get_license_files(root_path)

+         yield toml, doc_files, license_files

  

  def make_patch(toml, enabled=True, tmpfile=False):

      if not enabled:
@@ -189,6 +204,31 @@ 

  def _is_path(path):

      return "/" in path or path in {".", ".."}

  

+ def get_license_files(path):

+     license_files = []

+     exclude = { "vendor", "example", "examples", "_example", "_examples",

+                    "testdata", "_testdata", ".github", "tests", "test"}

+     for root, dirs, files in os.walk(path, topdown=True):

+         dirs[:] = [d for d in dirs if d not in exclude]

+         license_files.extend([os.path.relpath(os.path.join(root, f), path) for f in files if LICENSES.match(f)])

+     return license_files

+ 

+ def get_doc_files(path):

+     doc_files = []

+     include = {"doc", "docs", "example", "examples", "_example", "_examples"}

+     exclude = {"vendor", ".github", "tests", "test", ".circleci"}

+     matcher = re.compile(

+         r"(.*\.md|.*\.markdown|.*\.mdown|.*\.mkdn|.*\.rst|.*\.txt|AUTHORS|"

+         r"AUTHORS[\.\-].*|CONTRIBUTORS|CONTRIBUTORS[\.\-].*|README|"

+         r"README[\.\-].*|CHANGELOG|CHANGELOG[\.\-].*|TODO|TODO[\.\-].*)",

+         re.IGNORECASE)

+     matcherex = re.compile(r"CMakeLists\.txt")

+     for root, dirs, files in os.walk(path, topdown=True):

+         doc_files = doc_files + [d for d in dirs if d in include]

+         dirs[:] = [d for d in dirs if d not in exclude]

+         doc_files.extend([os.path.relpath(os.path.join(root, f), path) for f in files if matcher.match(f) and not LICENSES.match(f) and not matcherex.match(f)])

+     return doc_files

+ 

  def make_diff_metadata(crate, version, patch=False, store=False):

      if _is_path(crate):

          # Only things that look like a paths are considered local arguments
@@ -198,19 +238,20 @@ 

              if store:

                  raise ValueError("--store-crate can only be used for a crate")

  

-             toml, crate, version = local_toml(crate, version)

+             toml, crate, version, doc_files, license_files = local_toml(crate, version)

              diff = make_patch(toml, enabled=patch, tmpfile=True)

              metadata = Metadata.from_file(toml)

-             return metadata.name, diff, metadata

+             return metadata.name, diff, metadata, doc_files, license_files

      else:

          cratef, crate, version = download(crate, version)

  

-     with toml_from_crate(cratef, crate, version) as toml:

-         diff = make_patch(toml, enabled=patch)

-         metadata = Metadata.from_file(toml)

+     with files_from_crate(cratef, crate, version) as files:

+         diff = make_patch(files[0], enabled=patch)

+         metadata = Metadata.from_file(files[0])

+         doc_files, license_files = files[1:3]

      if store:

          shutil.copy2(cratef, os.path.join(os.getcwd(), f"{metadata.name}-{version}.crate"))

-     return crate, diff, metadata

+     return crate, diff, metadata, doc_files, license_files

  

  def to_list(s):

      if not s:
@@ -264,9 +305,8 @@ 

      if args.crate is None:

          parser.error("required crate/path argument missing")

  

-     crate, diff, metadata = make_diff_metadata(args.crate, args.version,

-                                                patch=args.patch,

-                                                store=args.store_crate)

+     crate, diff, metadata, doc_files, license_files = make_diff_metadata(

+       args.crate, args.version, patch=args.patch, store=args.store_crate)

  

      JINJA_ENV.globals["normalize_deps"] = normalize_deps

      JINJA_ENV.globals["to_list"] = to_list
@@ -339,6 +379,9 @@ 

          kwargs["license"] = license

          kwargs["license_comments"] = comments

  

+     kwargs["doc_files"] = doc_files

+     kwargs["license_files"] = license_files

+ 

      conf = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())

      conf.read(["_rust2rpm.conf", ".rust2rpm.conf"])

      if args.target not in conf:

file modified
+8 -8
@@ -108,11 +108,11 @@ 

  %description -n %{crate} %{_description}

  

  %files       -n %{crate}

-   {% if md.license_file is not none %}

- %license {{ md.license_file }}

+   {% if license_files|length > 0 %}

+ %license {{ license_files|join(' ') }}

    {% endif %}

-   {% if md.readme is not none %}

- %doc {{ md.readme }}

+   {% if doc_files|length > 0 %}

+ %doc {{ doc_files|join(' ') }}

    {% endif %}

    {% for bin in bins %}

  %{_bindir}/{{ bin.name }}
@@ -167,11 +167,11 @@ 

  

  %files       {{ pkg }}

      {% if feature is none %}

-       {% if md.license_file is not none %}

- %license {{ md.license_file }}

+       {% if license_files|length > 0 %}

+ %license {{ license_files|join(' ') }}

        {% endif %}

-       {% if md.readme is not none %}

- %doc {{ md.readme }}

+       {% if doc_files|length > 0 %}

+ %doc {{ doc_files|join(' ') }}

        {% endif %}

  %{cargo_registry}/%{crate}-%{version_no_tilde}/

      {% else %}

Signed-off-by: Robert-André Mauchin zebob.m@gmail.com

Black messed up the indentation a bit, hope it's not a problem.

Maybe make this a global? Also use {}:

EXCLUDED_FILES = {
   'vendor',
   'example',
  ...
}

Using a list comprehension would be nicer here...

So this is almost the same as the regex in get_license_files() above... I guess one should be a subset of the other one. Please don't duplicate this, but extract the common part.

I think this is pretty useful. Hacky, but it should mostly work in practice.

But to get this merged, I would ask you to refactor the patch a bit. The changes done by black are pretty bad. Not only do they make a lot of noise which hides the real changes, but also most of the formatting changes are for the worse. Please just exclude them from the patch.

Maybe make this a global? Also use {}:
EXCLUDED_FILES = { 'vendor', 'example', ... }

It's only used once, I don't see the point of making it global.

I'll address the other points.

rebased onto c6d12ef8b0d6e881c2fc974423655ecd19e45c91

3 years ago

rebased onto 5aa97963f699989f95af3050292ca6c12a75085a

3 years ago

rebased onto 0a91ff0

3 years ago

This looks pretty useful, and the code seems sane to me. @zbyszek could you have another look? I can technically merge this, but you're more familiar with the codebase so it's probably best for you to do it.

Sorry for the delay. Looks nice.

Pull-Request has been merged by zbyszek

2 years ago