Skip to content

Commit 6b8f409

Browse files
author
Miltos Allamanis
committed
Initial commit for open-sourcing.
1 parent f3c98db commit 6b8f409

8 files changed

+443
-0
lines changed
+142
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
using Newtonsoft.Json;
2+
using System;
3+
using System.Collections.Concurrent;
4+
using System.Collections.Generic;
5+
using System.IO;
6+
using System.IO.Compression;
7+
using System.Linq;
8+
using System.Runtime.CompilerServices;
9+
using System.Text;
10+
using System.Threading.Tasks;
11+
12+
namespace NearCloneDetector
13+
{
14+
/// <summary>
/// Builds an in-memory index of tokenized files (one sparse token-count vector per file,
/// grouped by the project file they were read from) and searches that index for pairs of
/// files whose similarity scores reach caller-supplied thresholds.
/// </summary>
class CloneDetector
{
    /// <summary>
    /// One record of a tokenized-project input file, populated by
    /// <c>JsonConvert.DeserializeObject</c> from a single input line.
    /// </summary>
    public struct TokenData
    {
        public string filename;
        public string[] tokens;
    }

    /// <summary>Files with fewer tokens than this are not indexed.</summary>
    private const int MinNumTokensForFile = 20;

    // Project file path -> (file name -> token-count vector).
    private readonly Dictionary<string, Dictionary<string, SparseVector>> _index = new Dictionary<string, Dictionary<string, SparseVector>>();

    // Maps each distinct token to the integer id used as the sparse-vector key.
    private readonly FeatureDictionary _dict = new FeatureDictionary();

    // Files already reported as the second element of a duplicate pair; they are
    // filtered out of subsequent comparisons.
    private readonly ConcurrentDictionary<string, bool> _alreadyDuplicatedFiles = new ConcurrentDictionary<string, bool>();

    // Serializes writes to Duplicates, which happen on parallel query workers.
    private readonly object _duplicatesGate = new object();

    /// <summary>Total number of indexed files across all projects.</summary>
    public int NumFiles => _index.Values.Sum(project => project.Count);

    /// <summary>
    /// For each file reported as <c>File1</c>, the set of files reported as its near duplicates.
    /// Filled in as the sequence returned by <see cref="FindNearDuplicates(double, double)"/> is enumerated.
    /// </summary>
    public readonly Dictionary<string, HashSet<string>> Duplicates = new Dictionary<string, HashSet<string>>();

    /// <summary>Records <paramref name="file2"/> as a duplicate of <paramref name="file1"/>.</summary>
    private void AddDuplicate(string file1, string file2)
    {
        // A private gate replaces MethodImplOptions.Synchronized, which locks on `this`.
        lock (_duplicatesGate)
        {
            if (!Duplicates.TryGetValue(file1, out var fileDups))
            {
                fileDups = new HashSet<string>();
                Duplicates.Add(file1, fileDups);
            }
            fileDups.Add(file2);
        }
    }

    /// <summary>Counts how many times each distinct token occurs in <paramref name="tokens"/>.</summary>
    /// <returns>One (token, occurrence count) pair per distinct token.</returns>
    private static IEnumerable<(string Token, int Count)> Count(IEnumerable<string> tokens)
    {
        var allCounts = new Dictionary<string, int>();
        foreach (var token in tokens)
        {
            // TryGetValue leaves currentCount at 0 when the token has not been seen yet.
            allCounts.TryGetValue(token, out var currentCount);
            allCounts[token] = currentCount + 1;
        }
        return allCounts.Select(kv => (kv.Key, kv.Value));
    }

    /// <summary>
    /// Indexes every <c>*.jsonl.gz</c> file found directly inside <paramref name="tokenizedFilesPath"/>.
    /// </summary>
    /// <param name="tokenizedFilesPath">Directory containing one gzipped JSON-lines file per project.</param>
    public void BuildIndexForProjects(string tokenizedFilesPath)
    {
        // Directory.GetFiles already returns paths that include tokenizedFilesPath, so they
        // must not be combined with it again: for a relative directory that would produce
        // "dir/dir/file" and the open would fail.
        foreach (var projectFile in Directory.GetFiles(tokenizedFilesPath, "*.jsonl.gz"))
        {
            Console.WriteLine($"Indexing project {projectFile}");
            BuildIndexForProject(projectFile);
        }
    }

    /// <summary>
    /// Reads one gzipped JSON-lines file and indexes every record that has at least
    /// <see cref="MinNumTokensForFile"/> tokens under the key <paramref name="parsedJsonlPath"/>.
    /// </summary>
    /// <param name="parsedJsonlPath">Path of the <c>.jsonl.gz</c> file; also used as the project key.</param>
    /// <exception cref="ArgumentException">The same path has already been indexed.</exception>
    public void BuildIndexForProject(string parsedJsonlPath)
    {
        var projectIndex = new Dictionary<string, SparseVector>();
        _index.Add(parsedJsonlPath, projectIndex);

        // File.OpenRead asks for read access only; FileStream(path, FileMode.Open) would
        // request read/write access and fail on read-only inputs.
        using (var stream = File.OpenRead(parsedJsonlPath))
        using (var uncompressed = new GZipStream(stream, CompressionMode.Decompress))
        using (var text = new StreamReader(uncompressed))
        {
            string line;
            while ((line = text.ReadLine()) != null)
            {
                // "null" records and blank lines carry no token data.
                if (string.IsNullOrWhiteSpace(line) || line == "null")
                {
                    continue;
                }

                var tokenData = JsonConvert.DeserializeObject<TokenData>(line);
                if (tokenData.filename == null || tokenData.tokens == null)
                {
                    continue; // Incomplete record: nothing to index it under or nothing to count.
                }

                // The sum of the per-token counts equals the number of tokens, so the
                // size check can be done before any counting work.
                if (tokenData.tokens.Length < MinNumTokensForFile)
                {
                    continue;
                }

                var spVect = new SparseVector();
                spVect.AddElements(Count(tokenData.tokens).Select(tc => (_dict.AddOrGet(tc.Token), tc.Count)));
                projectIndex[tokenData.filename] = spVect;
            }
        }
    }

    /// <summary>
    /// Enumerates every unordered pair of indexed projects, plus each project paired with itself.
    /// </summary>
    private IEnumerable<(string Project1, string Project2)> GetAllProjectCombinations()
    {
        var allProjects = _index.Keys.ToArray();
        for (int i = 0; i < allProjects.Length; i++)
        {
            for (int j = i + 1; j < allProjects.Length; j++)
            {
                yield return (allProjects[i], allProjects[j]);
            }
            // A project is also compared with itself so clones inside one project are found.
            yield return (allProjects[i], allProjects[i]);
        }
    }

    /// <summary>
    /// Lazily searches all project combinations, in parallel, for near-duplicate file pairs.
    /// </summary>
    /// <param name="keyJaccardThreshold">Minimum key-Jaccard similarity a pair must reach.</param>
    /// <param name="jaccardThreshold">Minimum Jaccard similarity a pair must reach.</param>
    /// <returns>Each reported pair together with both similarity scores.</returns>
    public IEnumerable<(string File1, string File2, double JaccardSimilarity, double KeyJacardSimilarity)> FindNearDuplicates(double keyJaccardThreshold, double jaccardThreshold)
    {
        return GetAllProjectCombinations()
            .AsParallel()
            .SelectMany(projs => FindNearDuplicates(keyJaccardThreshold, jaccardThreshold, projs.Project1, projs.Project2));
    }

    /// <summary>
    /// Compares every not-yet-duplicated file of <paramref name="project1"/> against every
    /// not-yet-duplicated file of <paramref name="project2"/> and yields the pairs that reach
    /// both thresholds. Each reported pair is also recorded in <see cref="Duplicates"/>, and its
    /// second file is marked so that later comparisons skip it.
    /// </summary>
    private IEnumerable<(string File1, string File2, double JaccardSimilarity, double KeyJacardSimilarity)> FindNearDuplicates(double keyJaccardThreshold, double jaccardThreshold, string project1, string project2)
    {
        IEnumerable<(string File1, string File2, double JaccardSimilarity, double KeyJacardSimilarity)> CompareWithProject2(KeyValuePair<string, SparseVector> fileInProject1)
        {
            foreach (var fileInProject2 in _index[project2])
            {
                if (_alreadyDuplicatedFiles.ContainsKey(fileInProject2.Key))
                {
                    continue; // Already reported as a duplicate of some other file.
                }
                if (fileInProject1.Key.Equals(fileInProject2.Key))
                {
                    continue; // The file is itself
                }

                // The key similarity is checked first; the second score is only
                // computed for pairs that pass it.
                var keyJaccardSimilarity = fileInProject1.Value.KeyJaccardSimilarity(fileInProject2.Value);
                if (keyJaccardSimilarity < keyJaccardThreshold)
                {
                    continue;
                }

                var jaccardSimilarity = fileInProject1.Value.JaccardSimilarity(fileInProject2.Value);
                if (jaccardSimilarity < jaccardThreshold)
                {
                    continue;
                }

                _alreadyDuplicatedFiles.TryAdd(fileInProject2.Key, true);
                AddDuplicate(fileInProject1.Key, fileInProject2.Key);
                yield return (fileInProject1.Key, fileInProject2.Key, jaccardSimilarity, keyJaccardSimilarity);
            }
        }

        return _index[project1]
            .AsParallel()
            .Where(f => !_alreadyDuplicatedFiles.ContainsKey(f.Key))
            .SelectMany(fileInProject1 => CompareWithProject2(fileInProject1));
    }
}
142+
}
+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
using System;
2+
using System.IO;
3+
using System.Linq;
4+
5+
namespace NearCloneDetector
6+
{
7+
/// <summary>Command-line entry point of the near-clone detector.</summary>
class CloneDetectorCli
{
    /// <summary>
    /// Expects exactly three arguments: the command (<c>detect</c>), the directory with the
    /// tokenized projects and the prefix of the output files.
    /// </summary>
    /// <exception cref="NotSupportedException">The command is not <c>detect</c>.</exception>
    static void Main(string[] args)
    {
        if (args.Length != 3)
        {
            Console.WriteLine("Usage detect <rootDir> <clonesOutputFilePrefix>");
            return;
        }

        var command = args[0];
        var rootDir = args[1];
        var clonesOutputFilePrefix = args[2];

        if (command != "detect")
        {
            throw new NotSupportedException($"Unsupported option {args[0]}");
        }

        DetectClones(rootDir, clonesOutputFilePrefix);
    }

    /// <summary>
    /// Indexes every project under <paramref name="rootDir"/>, writes each near-duplicate pair
    /// to <c>&lt;prefix&gt;.txt</c> as <c>file1,file2,jaccard,keyJaccard</c>, and writes the
    /// resulting clone groups to <c>&lt;prefix&gt;.json</c>.
    /// </summary>
    /// <param name="rootDir">Directory containing the tokenized project files.</param>
    /// <param name="clonesOutputFilePrefix">Path prefix for the <c>.txt</c> and <c>.json</c> outputs.</param>
    /// <param name="keyJaccardSimilarityThreshold">Minimum key-Jaccard similarity for a pair to be reported.</param>
    /// <param name="jaccardSimilarityThreshold">Minimum Jaccard similarity for a pair to be reported.</param>
    public static void DetectClones(string rootDir, string clonesOutputFilePrefix,
        double keyJaccardSimilarityThreshold = 0.8, double jaccardSimilarityThreshold = 0.7)
    {
        var cd = new CloneDetector();
        cd.BuildIndexForProjects(rootDir);

        Console.WriteLine($"[{DateTime.Now}] Searching for near duplicates...");

        // A Stopwatch measures elapsed time independently of wall-clock adjustments
        // (DST switches, clock sync), unlike subtracting two DateTime.Now readings.
        var stopwatch = System.Diagnostics.Stopwatch.StartNew();
        using (var writer = new StreamWriter(clonesOutputFilePrefix + ".txt"))
        {
            foreach (var (File1, File2, JaccardSimilarity, KeyJacardSimilarity) in
                cd.FindNearDuplicates(keyJaccardSimilarityThreshold, jaccardSimilarityThreshold))
            {
                Console.WriteLine($"Near duplicate: ({File1}-->{File2}) [scores: {JaccardSimilarity: #.##}, {KeyJacardSimilarity: #.#}]");

                // The file is comma-separated, so the scores must be written with the
                // invariant culture: a culture with a decimal comma would otherwise
                // emit "0,85" and corrupt the column layout.
                writer.WriteLine(FormattableString.Invariant($"{File1},{File2},{JaccardSimilarity},{KeyJacardSimilarity}"));
            }
        }
        stopwatch.Stop();

        Console.WriteLine($"Finished looking for duplicates in {cd.NumFiles} files.");
        Console.WriteLine($"Duplicate search took {stopwatch.Elapsed}.");

        // Flatten file -> duplicates into (file, duplicate) pairs for grouping.
        var cloneGroups = new CloneGroups(cd.Duplicates.SelectMany(c => c.Value.Select(f => (c.Key, f))));
        cloneGroups.SaveToJson(clonesOutputFilePrefix + ".json");
    }
}
57+
}

DuplicateCodeDetector/CloneGroups.cs

+87
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
using Newtonsoft.Json;
2+
using System;
3+
using System.Collections.Generic;
4+
using System.IO;
5+
using System.Linq;
6+
using System.Text;
7+
using System.Threading.Tasks;
8+
9+
namespace NearCloneDetector
10+
{
11+
/// <summary>
/// Groups pairwise clone relations into clone sets: every file reachable from another
/// through a chain of clone pairs ends up in the same set.
/// </summary>
public class CloneGroups
{
    /// <summary>Maps each cloned file to the clone set that contains it.</summary>
    public readonly Dictionary<string, HashSet<string>> FileToCloneSet = new Dictionary<string, HashSet<string>>();

    /// <summary>All clone sets, one per group of transitively related files.</summary>
    public readonly List<HashSet<string>> CloneSets = new List<HashSet<string>>();

    /// <summary>Records <paramref name="targetFile"/> as a direct clone of <paramref name="baseFile"/>.</summary>
    private void AddElementNonTransitive(string baseFile, string targetFile)
    {
        if (!FileToCloneSet.TryGetValue(baseFile, out var baseFileClones))
        {
            baseFileClones = new HashSet<string>();
            FileToCloneSet.Add(baseFile, baseFileClones);
        }
        baseFileClones.Add(targetFile);
    }

    /// <summary>
    /// Builds the clone sets from <paramref name="clonePairs"/> and prints summary statistics
    /// (number of cloned files, number of clusters and, when there is at least one cluster,
    /// the average and median cluster size) to the console.
    /// </summary>
    /// <param name="clonePairs">Pairs of files that are clones of each other; direction is ignored.</param>
    public CloneGroups(IEnumerable<(string File1, string File2)> clonePairs)
    {
        foreach (var (f1, f2) in clonePairs)
        {
            // Store the relation in both directions so that it is symmetric.
            AddElementNonTransitive(f1, f2);
            AddElementNonTransitive(f2, f1);
        }
        Console.WriteLine($"Found {FileToCloneSet.Count} files that are cloned.");
        var numCloneClusters = MakeCloneSetTransitive();
        Console.WriteLine($"Number of unique clone clusters {numCloneClusters}");

        if (CloneSets.Count == 0)
        {
            // No clusters: Average() throws on an empty sequence and a median is undefined.
            return;
        }

        var duplicationFactors = CloneSets.Select(c => c.Count).ToList();
        Console.WriteLine($"Avg Duplication Factor: {duplicationFactors.Average()}");

        duplicationFactors.Sort();
        Console.WriteLine($"Median Duplication Factor: {Median(duplicationFactors)}");
    }

    /// <summary>Returns the median of a non-empty list sorted in ascending order.</summary>
    private static double Median(List<int> sortedValues)
    {
        int midpoint = sortedValues.Count / 2;
        if (sortedValues.Count % 2 == 0)
        {
            // Even count: the two middle elements are at midpoint - 1 and midpoint.
            // Divide by 2.0 so that the half is not truncated away.
            return (sortedValues[midpoint - 1] + sortedValues[midpoint]) / 2.0;
        }
        return sortedValues[midpoint];
    }

    /// <summary>
    /// Replaces the direct clone relations in <see cref="FileToCloneSet"/> by their transitive
    /// closure and appends each resulting set to <see cref="CloneSets"/>.
    /// </summary>
    /// <returns>The number of clone sets created.</returns>
    private int MakeCloneSetTransitive()
    {
        var filesToVisit = new HashSet<string>(FileToCloneSet.Keys);
        int numCloneSets = 0;

        while (filesToVisit.Count > 0)
        {
            var cloneSet = new HashSet<string>() { filesToVisit.First() };
            int lastCloneSetSize;

            // Keep adding the direct clones of every member until the set stops growing.
            do
            {
                lastCloneSetSize = cloneSet.Count;
                cloneSet = new HashSet<string>(cloneSet.SelectMany(c => FileToCloneSet[c]).Union(cloneSet));
            }
            while (lastCloneSetSize != cloneSet.Count);

            numCloneSets += 1;
            CloneSets.Add(cloneSet);
            foreach (var f in cloneSet)
            {
                // Every member now shares the same, complete set.
                FileToCloneSet[f] = cloneSet;
                filesToVisit.Remove(f);
            }
        }
        return numCloneSets;
    }

    /// <summary>Serializes <see cref="CloneSets"/> as JSON into <paramref name="filename"/>.</summary>
    public void SaveToJson(string filename)
    {
        File.WriteAllText(filename, JsonConvert.SerializeObject(CloneSets));
    }
}
87+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<OutputType>Exe</OutputType>
5+
<TargetFramework>netcoreapp2.1</TargetFramework>
6+
<StartupObject>NearCloneDetector.CloneDetectorCli</StartupObject>
7+
</PropertyGroup>
8+
9+
<ItemGroup>
10+
<PackageReference Include="Newtonsoft.Json" Version="11.0.2" />
11+
</ItemGroup>
12+
13+
</Project>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
2+
Microsoft Visual Studio Solution File, Format Version 12.00
3+
# Visual Studio 15
4+
VisualStudioVersion = 15.0.28010.2048
5+
MinimumVisualStudioVersion = 10.0.40219.1
6+
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DuplicateCodeDetector", "DuplicateCodeDetector.csproj", "{3156A622-E72D-4974-A6FB-F7E679B82ACC}"
7+
EndProject
8+
Global
9+
GlobalSection(SolutionConfigurationPlatforms) = preSolution
10+
Debug|Any CPU = Debug|Any CPU
11+
Release|Any CPU = Release|Any CPU
12+
EndGlobalSection
13+
GlobalSection(ProjectConfigurationPlatforms) = postSolution
14+
{3156A622-E72D-4974-A6FB-F7E679B82ACC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15+
{3156A622-E72D-4974-A6FB-F7E679B82ACC}.Debug|Any CPU.Build.0 = Debug|Any CPU
16+
{3156A622-E72D-4974-A6FB-F7E679B82ACC}.Release|Any CPU.ActiveCfg = Release|Any CPU
17+
{3156A622-E72D-4974-A6FB-F7E679B82ACC}.Release|Any CPU.Build.0 = Release|Any CPU
18+
EndGlobalSection
19+
GlobalSection(SolutionProperties) = preSolution
20+
HideSolutionNode = FALSE
21+
EndGlobalSection
22+
GlobalSection(ExtensibilityGlobals) = postSolution
23+
SolutionGuid = {617E87BF-E065-484D-A6D2-0FD0004C3E34}
24+
EndGlobalSection
25+
EndGlobal
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
using System.Collections.Generic;
2+
3+
namespace NearCloneDetector
4+
{
5+
/// <summary>
/// Two-way mapping between tokens and integer ids. Ids are assigned in order of first
/// appearance, starting at 0.
/// </summary>
class FeatureDictionary
{
    // Position in the list is the id of the token stored there.
    private readonly List<string> _idToToken = new List<string>();
    private readonly Dictionary<string, int> _tokenToId = new Dictionary<string, int>();

    /// <summary>
    /// Returns the id of <paramref name="token"/>, assigning the next free id when the
    /// token has not been seen before.
    /// </summary>
    public int AddOrGet(string token)
    {
        if (!_tokenToId.TryGetValue(token, out var id))
        {
            // The next free id equals the number of tokens registered so far.
            id = _idToToken.Count;
            _idToToken.Add(token);
            _tokenToId.Add(token, id);
        }
        return id;
    }

    /// <summary>Returns the id of a token that has already been registered.</summary>
    /// <exception cref="KeyNotFoundException">The token was never registered.</exception>
    public int Get(string token) => _tokenToId[token];

    /// <summary>Returns the token that was assigned the given id.</summary>
    /// <exception cref="ArgumentOutOfRangeException">No token has that id.</exception>
    public string Get(int id) => _idToToken[id];
}
32+
}

0 commit comments

Comments
 (0)