Skip to content

Commit 3089019

Browse files
author
Miltos Allamanis
committed
Improve and expose CLI arguments for clone detection.
1 parent cace0f0 commit 3089019

File tree

4 files changed

+67
-37
lines changed

4 files changed

+67
-37
lines changed

DuplicateCodeDetector/CloneDetector.cs

+24-11
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
using Newtonsoft.Json;
2+
using Newtonsoft.Json.Linq;
23
using System;
34
using System.Collections.Concurrent;
45
using System.Collections.Generic;
6+
using System.Diagnostics;
57
using System.IO;
68
using System.IO.Compression;
79
using System.Linq;
@@ -13,16 +15,20 @@ namespace NearCloneDetector
1315
{
1416
class CloneDetector
1517
{
16-
public struct TokenData
17-
{
18-
public string filename;
19-
public string[] tokens;
20-
}
21-
2218
private readonly Dictionary<string, Dictionary<string, SparseVector>> _index = new Dictionary<string, Dictionary<string, SparseVector>>();
2319
public int NumFiles => _index.Sum(prj => prj.Value.Count);
2420
private readonly FeatureDictionary _dict = new FeatureDictionary();
2521

22+
private readonly string _tokensFieldName;
23+
private readonly string[] _identifyingFields;
24+
25+
public CloneDetector(string tokensField, string[] entryIdFields)
26+
{
27+
_tokensFieldName = tokensField;
28+
Debug.Assert(entryIdFields.Length > 0);
29+
_identifyingFields = entryIdFields;
30+
}
31+
2632
public readonly Dictionary<string, HashSet<string>> Duplicates = new Dictionary<string, HashSet<string>>();
2733

2834
[MethodImpl(MethodImplOptions.Synchronized)]
@@ -52,11 +58,17 @@ private void AddDuplicate(string file1, string file2)
5258

5359
public void BuildIndexForProjects(string tokenizedFilesPath)
5460
{
55-
var allFiles = Directory.GetFiles(tokenizedFilesPath, "*.jsonl.gz");
61+
var allFiles = Directory.GetFiles(tokenizedFilesPath, "*.gz")
62+
.Select(f=> Path.Combine(tokenizedFilesPath, f));
63+
BuildIndexFromFiles(allFiles);
64+
}
65+
66+
public void BuildIndexFromFiles(IEnumerable<string> allFiles)
67+
{
5668
foreach (var projectDir in allFiles)
5769
{
5870
Console.WriteLine($"Indexing project {projectDir}");
59-
BuildIndexForProject(Path.Combine(tokenizedFilesPath, projectDir));
71+
BuildIndexForProject(projectDir);
6072
}
6173
}
6274

@@ -77,14 +89,15 @@ public void BuildIndexForProject(string parsedJsonlPath)
7789
line = text.ReadLine();
7890
continue;
7991
}
80-
var tokenData = JsonConvert.DeserializeObject<TokenData>(line);
81-
var tokenCounter = Count(tokenData.tokens);
92+
var tokenData = JsonConvert.DeserializeObject<IDictionary<string, object>>(line);
93+
var tokenCounter = Count(((JArray)tokenData[_tokensFieldName]).Select(t=>t.ToString()));
8294

8395
if (tokenCounter.Sum(tc => tc.Count) >= MIN_NUM_TOKENS_FOR_FILE)
8496
{
8597
var spVect = new SparseVector();
8698
spVect.AddElements(tokenCounter.Select(tc => (_dict.AddOrGet(tc.Token), tc.Count)));
87-
projectIndex[tokenData.filename] = spVect;
99+
var entryIdentifier = string.Join(":", _identifyingFields.Select(idf => tokenData[idf].ToString()));
100+
projectIndex[entryIdentifier] = spVect;
88101
}
89102
line = text.ReadLine();
90103
}
+39-24
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,60 @@
1-
using System;
1+
using DocoptNet;
2+
using System;
3+
using System.Collections.Generic;
24
using System.IO;
35
using System.Linq;
46

57
namespace NearCloneDetector
68
{
79
class CloneDetectorCli
810
{
9-
static void Main(string[] args)
11+
private const string usage = @"Near Clone Detector.
12+
13+
Usage:
14+
CloneDetectorCli [options] (--dir=<folder> | --input=<file>) <output-file-prefix>
15+
16+
Options:
17+
-h --help Show this screen.
18+
--dir=<path> Directory where .jsonl.gz files live.
19+
--input=<path> The path to the input .jsonl.gz file.
20+
--id-fields=<fields> A colon (:)-separated list of names of fields that form the identity of each entry [default: filename].
21+
--tokens-field=<name> The name of the field containing the tokens of the code [default: tokens].
22+
--key-jaccard-threshold=<val> The Jaccard similarity threshold for token-sets [default: 0.8].
23+
--jaccard-threshold=<val> The Jaccard similarity threshold for token multisets [default: 0.7].
24+
25+
";
26+
27+
static void Main(string[] args)
28+
{
29+
var arguments = new Docopt().Apply(usage, args, version: "Near Clone Detector", exit: true);
30+
DetectClones(arguments);
31+
}
32+
33+
34+
35+
public static void DetectClones(IDictionary<string, ValueObject> arguments)
1036
{
11-
if (args.Length != 3)
37+
var cd = new CloneDetector(arguments["--tokens-field"].ToString(), arguments["--id-fields"].ToString().Split(':'));
38+
if (arguments.TryGetValue("--dir", out var dataDirectory) && dataDirectory != null)
1239
{
13-
Console.WriteLine("Usage detect <rootDir> <clonesOutputFilePrefix>");
14-
return;
40+
cd.BuildIndexForProjects(dataDirectory.ToString());
1541
}
16-
var rootDir = args[1];
17-
var clonesOutputFilePrefix = args[2];
18-
19-
if (args[0] == "detect")
42+
else if (arguments.TryGetValue("--input", out var dataFile) && dataFile != null)
2043
{
21-
DetectClones(rootDir, clonesOutputFilePrefix);
44+
cd.BuildIndexFromFiles(new[] { dataFile.ToString() });
2245
}
2346
else
2447
{
25-
throw new NotSupportedException($"Unsupported option {args[0]}");
48+
throw new Exception("Either --dir or --input need to be provided.");
2649
}
27-
}
28-
29-
30-
31-
public static void DetectClones(string rootDir, string clonesOutputFilePrefix,
32-
double keyJaccardSimilarityThreshold = 0.8, double jaccardSimilarityThreshold = 0.7)
33-
{
34-
var cd = new CloneDetector();
35-
cd.BuildIndexForProjects(rootDir);
36-
50+
3751
Console.WriteLine($"[{DateTime.Now}] Searching for near duplicates...");
3852
var startTime = DateTime.Now;
39-
using (var writer = new StreamWriter(clonesOutputFilePrefix + ".txt"))
53+
using (var writer = new StreamWriter(arguments["<output-file-prefix>"].ToString() + ".log"))
4054
{
4155
foreach (var (File1, File2, JaccardSimilarity, KeyJacardSimilarity) in
42-
cd.FindNearDuplicates(keyJaccardSimilarityThreshold, jaccardSimilarityThreshold))
56+
cd.FindNearDuplicates(double.Parse(arguments["--key-jaccard-threshold"].ToString()),
57+
double.Parse(arguments["--jaccard-threshold"].ToString())))
4358
{
4459
Console.WriteLine($"Near duplicate: ({File1}-->{File2}) [scores: {JaccardSimilarity: #.##}, {KeyJacardSimilarity: #.#}]");
4560
writer.WriteLine($"{File1},{File2},{JaccardSimilarity},{KeyJacardSimilarity}");
@@ -51,7 +66,7 @@ public static void DetectClones(string rootDir, string clonesOutputFilePrefix,
5166
Console.WriteLine($"Duplicate search took {elapsedTime}.");
5267

5368
var cloneGroups = new CloneGroups(cd.Duplicates.SelectMany(c => c.Value.Select(f => (c.Key, f))));
54-
cloneGroups.SaveToJson(clonesOutputFilePrefix + ".json");
69+
cloneGroups.SaveToJson(arguments["<output-file-prefix>"].ToString() + ".json");
5570
}
5671
}
5772
}

DuplicateCodeDetector/DuplicateCodeDetector.csproj

+1
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
</PropertyGroup>
88

99
<ItemGroup>
10+
<PackageReference Include="docopt.net" Version="0.6.1.10" />
1011
<PackageReference Include="Newtonsoft.Json" Version="11.0.2" />
1112
</ItemGroup>
1213

README.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ This cross-platform sample tool detects exact and near duplicates of code mainta
66

77
To run the near-duplicate detection run:
88
```
9-
$ dotnet run /path/to/DuplicateCodeDetector.csproj detect path/to/dataFolder outputFile
9+
$ dotnet run /path/to/DuplicateCodeDetector.csproj [options] --dir=<folder> <output-file-prefix>
1010
```
11-
This will use all the `.jsonl.gz` files in the `dataFolder` and output an `outputFile` with the duplicate pairs and an `outputFile.json` with the groups of detected duplicates.
11+
This will use all the `.gz` files in `<folder>` and output an `<output-file-prefix>.log` file with the detected duplicate pairs and an `<output-file-prefix>.json` file with the groups of detected duplicates. Run with `--help` to see all available options.
1212

1313
### Input Data
1414

@@ -19,6 +19,7 @@ The input data should be one or more `.jsonl.gz` files. These are compressed [JS
1919
"tokens" : ["list", "of", "tokens", "in", "file"]
2020
}
2121
```
22+
Alternative formats can be accepted by providing the `--tokens-field` and `--id-fields` options.
2223

2324
The `tokenizers` folder in this repository contains tokenizers for
2425
C\#, Java, JavaScript and Python. Please feel free to contribute tokenizers for other languages too.

0 commit comments

Comments
 (0)