PR#115: Convert wiki/RST format to Markdown for Trac Ticket Migration - pagure-importer

pagure-importer

#115 Convert wiki/RST format to Markdown for Trac Ticket Migration

Opened 7 years ago by mreynolds. Modified 7 years ago

mreynolds/pagure-importer issue91 into master

Break main conversion function into small functions. Also improved code block conversion.

Mark Reynolds • 7 years ago

a41618e

Fix pep8 errors

Mark Reynolds • 7 years ago

7fc0acf

Convert wiki/RST format to Markdown for Trac Ticket Migration

Mark Reynolds • 7 years ago

9cf4e4a

pagure_importer/utils/importer_trac.py

file modified

+193 -3

		`@@ -11,6 +11,19 @@`
		`get_close_status, is_image, issue_to_json, get_secure_filename)`
		`from pagure_importer.utils.models import User, Issue, IssueComment`

		`+ wikilink_pattern = re.compile('\[http(.*)\]')`
		`+ wikilink_extract = re.compile('\[(.*)\]')`
		`+ wikiheading1_pattern = re.compile('^= (.*) =$')`
		`+ wikiheading2_pattern = re.compile('^== (.*) ==$')`
		`+ wikiheading3_pattern = re.compile('^=== (.*) ===$')`
		`+ strikethrough_pattern = re.compile('~~(.*)~~')`
		`+ # Trac priorities`
		`+ priority_map = {'blocker': 1,`
		`+ 'critical': 2,`
		`+ 'major': 3,`
		`+ 'minor': 4,`
		`+ 'trivial': 5}`
		`+`

		`def to_timestamp(tm):`
		`''' Convert to timestamp which can be jsonified '''`
		`@@ -21,11 +34,91 @@`
		`return ts`


		`+ def strip_wikilink(content):`
		`+ ''' Need to remove wiki link format from custom fields. They come in a`
		`+ variety of forms that can be comma or whitespace separated. They can also`
		`+ include link names which must also be removed.`
		`+`
		`+ [https://bugzilla.redhat.com/show_bug.cgi?id=772777]`
		`+ [https://bugzilla.com/123456789], [http://bugzilla.com/7777777 7777777]`
		`+ [https://bugzilla.com/6666666 6666666]`
		`+ '''`
		`+`
		`+ links = []`
		`+ if wikilink_pattern.search(content):`
		`+ # Looks like we have a link in here`
		`+ links = []`
		`+ mylist = re.findall(r'\[([^]]*)\]', content)`
		`+ for i in mylist:`
		`+ links.append(i.split(' ', 1)[0])`
		`+ return ', '.join(links)`
		`+ else:`
		`+ return content`
		`+`
		`+`
		`+ def convert_headers(line):`
		`+ ''' Convert wikiformat headers`
		`+ '''`
		`+ level_count = 1`
		`+ for header in [wikiheading1_pattern,`
		`+ wikiheading2_pattern,`
		`+ wikiheading3_pattern]:`
		`+ try:`
		`+ level = header.search(line).group(1)`
		`+ if level:`
		`+ line = "%s %s" % ('#' * level_count, level)`
		`+ break # No need to check other heading levels`
		`+ except:`
		`+ # Try the next heading level`
		`+ pass`
		`+ level_count += 1`
		`+`
		`+ return line`
		`+`
		`+`
		`+ def convert_wikilinks(line):`
		`+ ''' Convert wikiformat links`
		`+ '''`
		`+ if wikilink_pattern.search(line):`
		`+ try:`
		`+ result = wikilink_extract.search(line).group(1)`
		`+ if result:`
		`+ parts = result.split(' ', 1)`
		`+ if len(parts) == 1:`
		`+ mdlink = '[%s](%s)' % (parts[0], parts[0])`
		`+ elif len(parts) == 2:`
		`+ mdlink = '[%s](%s)' % (parts[1], parts[0])`
		`+ line = line.replace('[' + result + ']', mdlink)`
		`+ except:`
		`+ # Not a link, not a problem`
		`+ pass`
		`+`
		`+ return line`
		`+`
		`+`
		`+ def convert_strike(line):`
		`+ ''' Convert wikiformat striked text`
		`+ '''`
		`+ striked_result = strikethrough_pattern.search(line)`
		`+ if striked_result:`
		`+ try:`
		`+ striked_text = striked_result.group(1)`
		`+ if striked_text:`
		`+ orig_text = '~~%s~~' % striked_text`
		`+ new_text = '<s>%s</s>' % striked_text`
		`+ line = line.replace(orig_text, new_text)`
		`+ except:`
		`+ # Not striked`
		`+ pass`
		`+ return line`
		`+`
		`+`
		`class TracImporter(object):`
		`''' Pagure importer for trac instance '''`

		`def __init__(self, project_url, username, password, offset, repo_name,`
		`- repo_folder, nopush, fasclient=None, tags=False, private=False):`
		`+ repo_folder, nopush, fasclient=None, tags=False,`
		`+ private=False):`
		`''' Instantiate a TracImporter object '''`
		`self.username = username`
		`self.password = password`
		`@@ -84,7 +177,12 @@`
		`all_ticket_fields = self.request('ticket.getTicketFields')`
		`custom_fields = []`
		`for field in all_ticket_fields:`
		`- if field.get('custom') is True:`
		`+ if (field.get('custom') is True or`
		`+ field.get('label').lower() == "component" or`
		`+ field.get('label').lower() == "keywords" or`
		`+ field.get('label').lower() == "version" or`
		`+ field.get('label').lower() == "type" or`
		`+ field.get('label').lower() == "cc"):`
		`current_field = {}`
		`current_field['name'] = field['name']`
		`key_type = 'text'`
		`@@ -92,8 +190,85 @@`
		`key_type = 'boolean'`
		`current_field['key_type'] = key_type`
		`custom_fields.append(current_field)`
		`+`
		`return custom_fields`

		`+ def WikiToMD(self, content):`
		`+ ''' Convert wiki/RST format to Markdown. Code blocks, bold/italics,`
		`+ wiki links, lists, striked text, and headers. '''`
		`+`
		`+ code_block = False`
		`+ in_list = False`
		`+ nested_level = 0`
		`+ prev_indent = 0`
		`+ new_content = ""`
		`+`
		`+ for line in content.split('\n'):`
		`+ line = line.replace("\r", "")`
		`+ if "{{{" in line:`
		`+ code_block = True`
		+ line = line.replace("{{{", "```")
		`+ if "}}}" in line:`
		`+ code_block = False`
		+ line = line.replace("}}}", "```")
		`+ if not code_block:`
		`+ #`
		`+ # Convert bullet lists. The start and end of a list needs`
		`+ # an empty line. wikiformat uses both '*' and '-' for its`
		`+ # lists. However, markdown only supports '-'.`
		`+ #`
		`+ if line.startswith('* '):`
		`+ if not in_list:`
		`+ new_content = "%s\n" % (new_content)`
		`+ in_list = True`
		`+ line = line[1:]`
		`+ line = '-%s' % (line)`
		`+ elif line.startswith('- '):`
		`+ # No need to modify the line, just add the new line`
		`+ if not in_list:`
		`+ new_content = "%s\n" % (new_content)`
		`+ in_list = True`
		`+ elif line.startswith(' '):`
		`+ # Check for nested lists`
		`+ nested_line = line.lstrip(' ')`
		`+ if nested_line.startswith('* ') or \`
		`+ nested_line.startswith('- '):`
		`+ # Adjust the nested list level as needed`
		`+ indent = len(line) - len(nested_line)`
		`+ if indent > prev_indent:`
		`+ nested_level += 1`
		`+ elif indent < prev_indent:`
		`+ nested_level -= 1`
		`+ prev_indent = indent`
		`+`
		`+ # Set the proper indentation for markdown`
		`+ line = ('%s-%s' % (' ' * nested_level,`
		`+ nested_line[1:]))`
		`+ else:`
		`+ if in_list:`
		`+ # Add the closing empty line`
		`+ new_content = "%s\n" % (new_content)`
		`+ in_list = False`
		`+ nested_level = 0`
		`+ prev_indent = 0`
		`+`
		`+ # Convert headers`
		`+ line = convert_headers(line)`
		`+`
		`+ # Convert wiki links`
		`+ line = convert_wikilinks(line)`
		`+`
		`+ # Convert striked through text`
		`+ line = convert_strike(line)`
		`+`
		`+ # Convert bold and italic text (do this last)`
		`+ line = line.replace("'''", "**") # Convert bold text`
		`+ line = line.replace("''", "*") # Convert italic text`
		`+`
		`+ new_content = "%s%s\n" % (new_content, line)`
		`+`
		`+ return new_content`
		`+`
		`def import_issues(self, repo_name, trac_query='max=0&order=id'):`
		`''' Queries the trac instance via its jsonrpc API and convert the`
		`tickets into JSON blob to be imported into pagure's ticket git repo.`
		`@@ -103,7 +278,6 @@`
		`:kwarg trac_query: the query to call trac with in order to retrieve`
		`all the tickets.`
		Defaults to ``max=0&order=id``
		`-`
		`'''`

		`tickets_id = self.request('ticket.query', trac_query)`
		`@@ -130,9 +304,14 @@`
		`else:`
		`comments[key].comment += ('\n[%s](%s)' %`
		`(attach_name, url))`
		`+ else:`
		`+ # Convert any RST formatting to Markdown`
		`+ comments[key].comment = \`
		`+ self.WikiToMD(comments[key].comment)`
		`pagure_issue.comments.append(comments[key].to_json())`
		`click.echo('Updated ' + repo_name + ' with issue :' +`
		`str(ticket_id) + '/' + str(tickets_id[-1]))`
		`+`
		`issue_to_json(pagure_issue, self.clone_repo_location)`

		`def get_custom_fields_of_ticket(self, trac_ticket):`
		`@@ -149,6 +328,8 @@`
		`pagure_field['value'] = trac_ticket.get(`
		`pagure_field['name'], "").strip()`
		`if pagure_field['value']:`
		`+ pagure_field['value'] = \`
		`+ strip_wikilink(pagure_field['value'])`
		`pagure_fields.append(pagure_field)`
		`return pagure_fields`

		`@@ -180,6 +361,8 @@`
		`pagure_issue_content = trac_ticket['description']`
		`if pagure_issue_content == '':`
		`pagure_issue_content = '#No Description Provided'`
		`+ else:`
		`+ pagure_issue_content = self.WikiToMD(pagure_issue_content)`

		`issue_status, close_status = self.get_ticket_status(trac_ticket)`

		`@@ -207,6 +390,12 @@`
		`if 'milestone' in trac_ticket and trac_ticket['milestone'] != '':`
		`pagure_milestone = trac_ticket['milestone']`

		`+ # The priority of the issue`
		`+ priority = None`
		`+ if 'priority' in trac_ticket and trac_ticket['priority'] != '' and \`
		`+ trac_ticket['priority'] in priority_map:`
		`+ priority = priority_map[trac_ticket['priority']]`
		`+`
		`# Issue tags`
		`pagure_issue_tags = []`
		`if self.tags:`
		`@@ -227,6 +416,7 @@`
		`title=pagure_issue_title,`
		`content=pagure_issue_content,`
		`status=issue_status,`
		`+ priority=priority,`
		`close_status=close_status,`
		`date_created=pagure_issue_created_at,`
		`user=pagure_issue_user.to_json(),`

pagure_importer/utils/models.py

file modified

+3 -1

		`@@ -9,12 +9,13 @@`
		`self, id, title, content,`
		`status, date_created, user, private, attachment, tags,`
		`depends, blocks, assignee, close_status, comments=None,`
		`- milestone=None, custom_fields=None):`
		`+ milestone=None, custom_fields=None, priority=None):`

		`self.id = id`
		`self.title = title`
		`self.content = content`
		`self.status = status`
		`+ self.priority = priority`
		`self.close_status = close_status`
		`self.date_created = date_created`
		`self.user = user`
		`@@ -42,6 +43,7 @@`
		`'date_created': self.date_created,`
		`'user': self.user,`
		`'private': self.private,`
		`+ 'priority': self.priority,`
		`'tags': self.tags,`
		`'depends': self.depends,`
		`'blocks': self.blocks,`

mreynolds commented 7 years ago

This patch converts the majority of wiki/RST formatting to Markdown. It also strips the wikilink format from custom fields and converts the Trac priority to the Pagure priority format.

1 new commit added

Fix pep8 errors

7 years ago

bowlofeggs commented on line 200 of pagure_importer/utils/importer_trac.py 7 years ago

This method is around 120 lines of code, which makes it challenging to read. I recommend breaking it into smaller helper methods so that each logical component is easier to understand, and most importantly, easier to test.

bowlofeggs commented on line 89 of pagure_importer/utils/importer_trac.py 7 years ago

Would you not want to replace \r with \n?

bowlofeggs commented 7 years ago

This change looks reasonable to me. In addition to my other recommendations, I suggest writing automated tests for this code. It would help prove that it works as expected, and it actually seems like it could be a fun set of tests too.

mreynolds commented on line 90 of pagure_importer/utils/importer_trac.py 7 years ago

Actually, in Trac I always see this pattern: "\r\n". I never see "\r" by itself.

rebased

7 years ago

rharwood commented 7 years ago

This is really helpful; thanks, @mreynolds.

It could also detect commit hashes and generate code links like Trac does. I used the following (which you are welcome to take in the event that there isn't a better way to do this):

def convert_hashes(line, commit_url_prefix="/gssproxy/c/"):
    ''' Turn full commit hashes in a line into Markdown commit links.

    Every standalone run of exactly 40 lowercase hexadecimal characters
    (delimited by word boundaries) is replaced with
    ``[<first 7 chars>](<commit_url_prefix><full hash>)``.

    :arg line: the text to scan.
    :kwarg commit_url_prefix: URL prefix placed in front of the full hash
        in the link target. Defaults to ``/gssproxy/c/``.
    :return: the line with each matching hash replaced by a link; the
        line is returned unchanged when it contains no such hash.
    '''

    def _to_link(match):
        ''' Build the Markdown link for one matched hash. '''
        sha = match.group(0)
        return "[%s](%s%s)" % (sha[:7], commit_url_prefix, sha)

    # Substitute in a single pass. Replacing the hashes one by one with
    # str.replace would rewrite a hash that appears twice a second time,
    # inside the link generated for its first occurrence, and would also
    # ignore the word boundaries required by the pattern.
    return re.sub(r"\b[0-9a-f]{40}\b", _to_link, line)