1"""Diffs one repo source tree an upstream repo source tree.
2
3Matches the projects from a Gerrit repo workspace to the projects
4of an upstream workspace. After identifying exist both in the
5downstream and the upstream workspace it then diffs the each project.
6
7Finally, the results of the project matching and diffing are reported.
8
9"""
10
11from __future__ import absolute_import
12from __future__ import division
13from __future__ import print_function
14import argparse
15import csv
16import datetime
17import multiprocessing
18import multiprocessing.pool
19import os
20import re
21import subprocess
22import xml.etree.ElementTree as et
23import git_commits_not_upstreamed
24
25
26def get_projects(source_tree):
27  """Retrieve the dict of projects names and paths.
28
29  Args:
30    source_tree: A path to the source tree.
31
32  Returns:
33    A dict of project paths keyed by project names.
34  """
35
36  projects = {}
37
38  manifest = source_tree + '/.repo/manifest.xml'
39  tree = et.parse(manifest)
40  root = tree.getroot()
41
42  for project in root.findall('project'):
43    # Ignore projects that are not synced by default
44    if 'notdefault' in project.get('groups', ''):
45      continue
46    path = project.get('path', project.get('name'))
47    path = os.path.abspath(os.path.join(source_tree, path))
48    name = project.get('name')
49
50    # check if project files actually exist
51    if not os.path.exists(path):
52      continue
53
54    projects[name] = path
55
56  return projects
57
58
59def git(args):
60  """Git command.
61
62  Args:
63    args: A list of arguments to be sent to the git command.
64
65  Returns:
66    The output of the git command.
67  """
68
69  command = ['git']
70  command.extend(args)
71  with open(os.devnull, 'w') as devull:
72    return subprocess.check_output(command, stderr=devull)
73
74
75def get_revision_diff_stats(directory, rev_a, rev_b):
76  """Retrieves stats of diff between two git revisions.
77
78  Args:
79    directory: A path to the git directory to diff.
80    rev_a: A git revision to diff.
81    rev_b: A git revision to diff.
82
83  Returns:
84    A dict with the count of files modified, lines added
85    and lines removed.
86  """
87  stats = {
88      'file': 0,
89      'insertion': 0,
90      'deletion': 0,
91  }
92
93  git_diffstat = git(
94      ['-C', directory, 'diff', '--shortstat', rev_a, rev_b])
95  for element in git_diffstat.split(','):
96    for key in stats:
97      if key in element:
98        stats[key] = int(element.split()[0])
99
100  return stats
101
102
103def get_project_stats(upstream_dir, downstream_dir):
104  """Retrieves stats of diff between two git projects.
105
106  Diffs a downstream directory against an upstream directory.
107  Lines that exist only in the downstream directory are considered insertions.
108  Lines that exist only in the upstream directory are considered deletions.
109
110  Args:
111    upstream_dir: A path to the upstream directory to compare.
112    downstream_dir: A path to the downstream directory to compare.
113
114  Returns:
115    A dict with the count of files modified, lines added
116    and lines removed.
117  """
118  stats = {
119      'file': 0,
120      'insertion': 0,
121      'deletion': 0,
122  }
123
124  if upstream_dir and downstream_dir:
125    print('Diffing %s vs %s' % (downstream_dir, upstream_dir))
126    git(['-C', downstream_dir, 'fetch', '--update-shallow', upstream_dir])
127    stats = get_revision_diff_stats(downstream_dir, 'FETCH_HEAD', 'HEAD')
128
129  return stats
130
131
132def match_project_by_root_commits(
133    downstream_project_name, downstream_project_path, upstream_root_commits):
134  """Match a downstream project to an upstream project using their root commits.
135
136  Find all root commits in a downstream project and find a matching
137  upstream project that have a root commit in common.
138
139  Args:
140    downstream_project_name: A string with the downstream project name.
141    downstream_project_path: A string with the downstream project path.
142    upstream_root_commits: A dict of root commits and their upstream project.
143
144  Returns:
145    A string with the matched upstream project name.
146  """
147  upstream_match = None
148  downstream_root_commits = find_root_commits_in_path(downstream_project_path)
149  for root in downstream_root_commits:
150    if root in upstream_root_commits:
151      upstream_project_list = upstream_root_commits[root]
152      if len(upstream_project_list) > 1:
153        print('Warning: ' + downstream_project_name +
154              ' matches multiple projects')
155        print(upstream_project_list)
156      else:
157        upstream_match = upstream_project_list[0]['name']
158      # Once there's a root commit match, stop looking for a project match
159      break
160
161  return upstream_match
162
163
164def match_projects(upstream_projects, downstream_projects):
165  """Match downstream projects to upstream projects.
166
167  Args:
168    upstream_projects: A dict of upstream projects.
169    downstream_projects: A dict of downstream projects.
170
171  Returns:
172    A list of upstream and downstream project pairs.
173  """
174
175  project_matches = []
176
177  # keep a list of upstream projects that have not been matched
178  unmatched_upstream_projects = set(upstream_projects.keys())
179
180  upstream_root_commits = find_root_commits_in_projects(upstream_projects)
181  # Match all downstream projects to an upstream project
182  for downstream_name, downstream_path in downstream_projects.iteritems():
183    # First try to match projects by name
184    if downstream_name in upstream_projects:
185      upstream_match = downstream_name
186    # If there is no project name match then try matching by commit
187    else:
188      upstream_match = match_project_by_root_commits(
189          downstream_name, downstream_path, upstream_root_commits)
190
191    project_matches.append({
192        'upstream': upstream_match,
193        'downstream': downstream_name,
194    })
195    unmatched_upstream_projects.discard(upstream_match)
196
197  # Add all upstream projects that have not been matched
198  for project in unmatched_upstream_projects:
199    project_matches.append({
200        'upstream': project,
201        'downstream': None,
202    })
203
204  return project_matches
205
206
207def filter_exclusion_list(projects, exclusion_file):
208  """Removes all projects that match the exclusion patterns."""
209
210  filtered = {}
211
212  exclusion_list = []
213  if exclusion_file:
214    with open(exclusion_file) as f:
215      exclusion_list = f.readlines()
216  exclusion_list = [line.strip() for line in exclusion_list]
217  exclusion_pattern = '|'.join(exclusion_list)
218
219  if exclusion_pattern:
220    for name, path in projects.iteritems():
221      if re.match(exclusion_pattern, name):
222        print('Excluding ' + name)
223      else:
224        filtered[name] = path
225  else:
226    filtered = projects
227
228  return filtered
229
230
231def get_all_projects_stats(upstream_source_tree,
232                           downstream_source_tree,
233                           exclusion_file):
234  """Finds the stats of all project in a source tree.
235
236  Args:
237    upstream_source_tree: A string with the path to the upstream gerrit
238      source tree.
239    downstream_source_tree: A string with the path to the downstream gerrit
240      source tree.
241    exclusion_file: A string with the path to the exclusion file.
242
243  Returns:
244    A list of dicts of matching upstream and downstream projects
245    including stats for projects that matches.
246  """
247  upstream_projects, downstream_projects = map(
248    lambda t: get_projects_with_filter(t, exclusion_file),
249    (upstream_source_tree, downstream_source_tree),
250  )
251
252  return multiprocessing.pool.ThreadPool(
253    processes=multiprocessing.cpu_count()
254  ).map(
255    lambda match: stats_from_match(
256      upstream_projects,
257      downstream_projects,
258      match,
259    ),
260    match_projects(upstream_projects, downstream_projects),
261  )
262
263
264def stats_from_match(upstream_projects, downstream_projects, match):
265  """Finds the stats of a single match of two projects.
266
267  Args:
268    upstream_projects: list of dicts obtained from get_project_stats
269    downstream_projects: list of dicts obtained from get_project_stats
270    match: a single match dict obtained from match_projects
271
272  Returns:
273    A dict of stats for this particular match
274  """
275
276  def display_status(upstream_project_name,
277                      downstream_project_name,
278                      project_stats):
279    if not upstream_project_name:
280      return 'Downstream Only Projects'
281    elif not downstream_project_name:
282      return 'Upstream Only Projects'
283    elif project_stats['file'] == 0:
284      return 'Intact Projects'
285    elif upstream_project_name == downstream_project_name:
286      return 'Modified Projects'
287    return 'Forked Projects'
288
289  upstream_project_name = match['upstream']
290  downstream_project_name = match['downstream']
291
292  project_stats = get_project_stats(
293    upstream_projects.get(upstream_project_name),
294    downstream_projects.get(downstream_project_name),
295  )
296  project_stats.update({
297    'status': display_status(
298      upstream_project_name,
299      downstream_project_name,
300      project_stats
301    ),
302    'downstream_path': downstream_projects.get(downstream_project_name)
303  })
304  project_stats.update(match)
305  return project_stats
306
307
308def get_projects_with_filter(source_tree, exclusion_file):
309  """ Helper function to get projects with an exclusion file filter applied."""
310  return filter_exclusion_list(
311    get_projects(source_tree),
312    exclusion_file,
313  )
314
315
316def find_root_commits_in_path(path):
317  """Returns a list of root commits in a git project path."""
318  print('Analyzing history of ' + path)
319  rev_list = git(['-C', path, 'rev-list', '--max-parents=0', 'HEAD'])
320  return rev_list.splitlines()
321
322
323def find_root_commits_in_projects(projects):
324  """Returns a dict of root commits with all projects with that root commit."""
325  root_commits = {}
326  for name, path in projects.iteritems():
327    for root in find_root_commits_in_path(path):
328      root_list = root_commits.get(root, [])
329      root_list.append({
330          'name': name,
331          'path': path,
332      })
333      root_commits[root] = root_list
334  return root_commits
335
336
337def get_commit_stats_in_project(project):
338  """Extract commits that have not been upstreamed in a specific project.
339
340  Args:
341    project: A dict of a project name and path.
342
343  Returns:
344    A dict of commits not upstreamed.
345  """
346  name = project['name']
347  path = project['downstream_path']
348  print('Finding commits not upstreamed in ' + name)
349  commits = git_commits_not_upstreamed.find('FETCH_HEAD', 'HEAD', path)
350  print('Found commits not upstreamed in ' + name)
351  stats = []
352  for commit in commits:
353    author = git(['-C', path, 'show', '--no-patch', '--format=%ae', commit])
354    author = author.strip()
355    subject = git(['-C', path, 'show', '--no-patch', '--format=%s', commit])
356    subject = subject.strip()
357    stats.append({
358        'commit': commit,
359        'author': author,
360        'subject': subject,
361    })
362
363  return {
364      'name': name,
365      'stats': stats,
366  }
367
368
369def get_all_commits_stats(project_stats):
370  """Extract commits that have not been upstreamed in all projects.
371
372  Args:
373    project_stats: A dict of matching upstream and downstream projects
374      including stats for projects that matches.
375
376  Returns:
377    A dict of commits not upstreamed.
378  """
379  commit_stats = {}
380  downstream_stats = {match['downstream']: match for match in project_stats}
381
382  # Only analyze modified projects
383  modified_projects = []
384  for name, stats in downstream_stats.iteritems():
385    if stats['status'].startswith('Modified'):
386      stats['name'] = name
387      modified_projects.append(stats)
388
389  pool = multiprocessing.Pool()
390
391  commit_stats = pool.map(get_commit_stats_in_project, modified_projects)
392
393  commit_stats = {stats['name']: stats['stats'] for stats in commit_stats}
394
395  return commit_stats
396
397
398def write_commit_csv(commit_stats, commit_output_file):
399  """Write project comparison data to a CSV file.
400
401  Args:
402    commit_stats: The dict of the stats for all commits.
403    commit_output_file: Path to the output file.
404  """
405  with open(commit_output_file, 'w') as f:
406    fieldnames = [
407        'Date',
408        'Commit',
409        'Downstream Project',
410        'Author',
411        'Subject',
412    ]
413    today = datetime.datetime.today().strftime('%Y/%m/%d')
414    writer = csv.DictWriter(f, fieldnames=fieldnames)
415    writer.writeheader()
416    for project, stats in commit_stats.iteritems():
417      for stat in stats:
418        writer.writerow({
419            'Date': today,
420            'Commit': stat['commit'],
421            'Downstream Project': project,
422            'Author': stat['author'],
423            'Subject': stat['subject'],
424        })
425  print('Wrote commit stats to ' + commit_output_file)
426
427
428def write_project_csv(project_stats, commit_stats, project_output_file):
429  """Write project comparison data to a CSV file.
430
431  Args:
432    project_stats: The dict of the stats for all projects.
433    commit_stats: The dict of the stats for all commits.
434    project_output_file: Path to the output file.
435  """
436  with open(project_output_file, 'w') as f:
437    fieldnames = [
438        'Date',
439        'Downstream Project',
440        'Upstream Project',
441        'Diff Status',
442        'Files Changed',
443        'Line Insertions',
444        'Line Deletions',
445        'Line Changes',
446        'Commits Not Upstreamed',
447    ]
448    writer = csv.DictWriter(f, fieldnames=fieldnames)
449    writer.writeheader()
450    today = datetime.datetime.today().strftime('%Y/%m/%d')
451    for stat in project_stats:
452      commits_not_upstreamed = 0
453      downstream_project = stat['downstream']
454      if downstream_project in commit_stats:
455        commits_not_upstreamed = len(commit_stats[downstream_project])
456      writer.writerow({
457          'Date': today,
458          'Downstream Project': downstream_project,
459          'Upstream Project': stat['upstream'],
460          'Diff Status': stat['status'],
461          'Files Changed': stat['file'],
462          'Line Insertions': stat['insertion'],
463          'Line Deletions': stat['deletion'],
464          'Line Changes': stat['insertion'] + stat['deletion'],
465          'Commits Not Upstreamed': commits_not_upstreamed,
466      })
467  print('Wrote project stats to ' + project_output_file)
468
469
470def diff(upstream_source_tree, downstream_source_tree, project_output_file,
471         commit_output_file, exclusions_file):
472  """Diff one repo source tree against another.
473
474  Args:
475    upstream_source_tree: A string with the path to a gerrit source tree.
476    downstream_source_tree: A string with the path to a gerrit source tree.
477    project_output_file: Path to the project output file.
478    commit_output_file: Path to the commit output file.
479    exclusions_file: Path to exclusions file.
480  """
481  project_stats = get_all_projects_stats(upstream_source_tree,
482                                         downstream_source_tree,
483                                         exclusions_file)
484  commit_stats = get_all_commits_stats(project_stats)
485  write_commit_csv(commit_stats, commit_output_file)
486  write_project_csv(project_stats, commit_stats, project_output_file)
487
488
489def main():
490  parser = argparse.ArgumentParser(
491      description='Diff a repo source tree against an upstream source tree.')
492  parser.add_argument('upstream_path', help='Path to an upstream source tree.')
493  parser.add_argument(
494      'downstream_path', help='Path to a downstream source tree.')
495  parser.add_argument(
496      '-p',
497      '--project_output_file',
498      help='Path to write the project output file',
499      default='project.csv',)
500  parser.add_argument(
501      '-c',
502      '--commit_output_file',
503      help='Path to write the commit output file',
504      default='commit.csv',)
505  parser.add_argument(
506      '-e',
507      '--exclusions_file',
508      help='Path to file with a list of project names to be excluded from'
509      'the diff. You may use a regular expression to match project names as'
510      'described in https://docs.python.org/2/howto/regex.html',
511      default='',
512  )
513  args = parser.parse_args()
514  upstream_source_tree = os.path.abspath(args.upstream_path)
515  downstream_source_tree = os.path.abspath(args.downstream_path)
516  project_output_file = os.path.abspath(args.project_output_file)
517  commit_output_file = os.path.abspath(args.commit_output_file)
518  exclusions_file = ''
519  if args.exclusions_file:
520    exclusions_file = os.path.abspath(args.exclusions_file)
521
522  diff(upstream_source_tree, downstream_source_tree, project_output_file,
523       commit_output_file, exclusions_file)
524
525
526if __name__ == '__main__':
527  main()
528