|
| 1 | +using Newtonsoft.Json; |
| 2 | +using System; |
| 3 | +using System.Collections.Concurrent; |
| 4 | +using System.Collections.Generic; |
| 5 | +using System.IO; |
| 6 | +using System.IO.Compression; |
| 7 | +using System.Linq; |
| 8 | +using System.Runtime.CompilerServices; |
| 9 | +using System.Text; |
| 10 | +using System.Threading.Tasks; |
| 11 | + |
namespace NearCloneDetector
{
    /// <summary>
    /// Indexes tokenized files per project as sparse token-count vectors and
    /// reports pairs of files whose vectors pass both similarity thresholds
    /// (<c>SparseVector.KeyJaccardSimilarity</c> and <c>SparseVector.JaccardSimilarity</c>).
    /// </summary>
    /// <remarks>
    /// Indexing (<see cref="BuildIndexForProject"/>) is not synchronized and is expected to
    /// finish before <see cref="FindNearDuplicates(double, double)"/> is enumerated.
    /// </remarks>
    class CloneDetector
    {
        /// <summary>
        /// One record of a tokenized-files archive, deserialized from a single JSON line.
        /// The field names are part of the JSON contract and must not be renamed.
        /// </summary>
        public struct TokenData
        {
            public string filename;
            public string[] tokens;
        }

        /// <summary>Per-project index: project archive path -> (file name -> token-count vector).</summary>
        private readonly Dictionary<string, Dictionary<string, SparseVector>> _index = new Dictionary<string, Dictionary<string, SparseVector>>();

        /// <summary>Total number of indexed files across all projects.</summary>
        public int NumFiles => _index.Sum(prj => prj.Value.Count);

        /// <summary>Maps each token string to the feature id stored in the sparse vectors.</summary>
        private readonly FeatureDictionary _dict = new FeatureDictionary();

        /// <summary>
        /// Maps a file to the set of files reported as its near-duplicates.
        /// Filled in as the result of <see cref="FindNearDuplicates(double, double)"/> is enumerated.
        /// Writers are serialized internally; reading it while an enumeration is still
        /// running is not protected.
        /// </summary>
        public readonly Dictionary<string, HashSet<string>> Duplicates = new Dictionary<string, HashSet<string>>();

        /// <summary>Private gate for <see cref="Duplicates"/>; avoids locking on <c>this</c>.</summary>
        private readonly object _duplicatesLock = new object();

        /// <summary>Files that have already been reported as a duplicate of some other file.</summary>
        private readonly ConcurrentDictionary<string, bool> _alreadyDuplicatedFiles = new ConcurrentDictionary<string, bool>();

        /// <summary>Files with fewer tokens than this are not indexed.</summary>
        private const int MinNumTokensForFile = 20;

        /// <summary>
        /// Records <paramref name="file2"/> as a near-duplicate of <paramref name="file1"/>.
        /// Called concurrently from the parallel search, hence the lock.
        /// </summary>
        private void AddDuplicate(string file1, string file2)
        {
            lock (_duplicatesLock)
            {
                if (!Duplicates.TryGetValue(file1, out var fileDups))
                {
                    fileDups = new HashSet<string>();
                    Duplicates.Add(file1, fileDups);
                }
                fileDups.Add(file2);
            }
        }

        /// <summary>
        /// Counts how often each distinct token occurs in <paramref name="tokens"/>.
        /// </summary>
        /// <returns>One (token, occurrences) pair per distinct token.</returns>
        private static IEnumerable<(string Token, int Count)> Count(IEnumerable<string> tokens)
        {
            var allCounts = new Dictionary<string, int>();
            foreach (var token in tokens)
            {
                // On a miss TryGetValue leaves the out value at 0, which is the right start.
                allCounts.TryGetValue(token, out var currentCount);
                allCounts[token] = currentCount + 1;
            }
            return allCounts.Select(kv => (kv.Key, kv.Value));
        }

        /// <summary>
        /// Indexes every <c>*.jsonl.gz</c> archive found directly inside
        /// <paramref name="tokenizedFilesPath"/>; each archive is treated as one project.
        /// </summary>
        /// <param name="tokenizedFilesPath">Directory containing the tokenized project archives.</param>
        public void BuildIndexForProjects(string tokenizedFilesPath)
        {
            // Directory.GetFiles already returns paths that include tokenizedFilesPath.
            // Combining them with the directory again would duplicate the directory
            // part whenever a relative path is supplied.
            foreach (var projectFile in Directory.GetFiles(tokenizedFilesPath, "*.jsonl.gz"))
            {
                Console.WriteLine($"Indexing project {projectFile}");
                BuildIndexForProject(projectFile);
            }
        }

        /// <summary>
        /// Reads one gzip-compressed JSON-lines archive and adds a token-count vector
        /// for every file in it that has at least <see cref="MinNumTokensForFile"/> tokens.
        /// </summary>
        /// <param name="parsedJsonlPath">
        /// Path of the archive; it is also used as the project's key in the index.
        /// </param>
        /// <exception cref="ArgumentException">The same path has already been indexed.</exception>
        public void BuildIndexForProject(string parsedJsonlPath)
        {
            var projectIndex = new Dictionary<string, SparseVector>();
            _index.Add(parsedJsonlPath, projectIndex);

            // File.OpenRead asks for read access only, so read-only archives can be indexed.
            using (var stream = File.OpenRead(parsedJsonlPath))
            using (var uncompressed = new GZipStream(stream, CompressionMode.Decompress))
            using (var text = new StreamReader(uncompressed))
            {
                string line;
                while ((line = text.ReadLine()) != null)
                {
                    // Blank lines and literal "null" lines carry no record.
                    if (string.IsNullOrWhiteSpace(line) || line == "null")
                    {
                        continue;
                    }

                    var tokenData = JsonConvert.DeserializeObject<TokenData>(line);

                    // The sum of all per-token counts equals the number of tokens, so the
                    // length check is equivalent and avoids counting files that are dropped.
                    // A record without a token array is treated as having zero tokens.
                    if (tokenData.tokens == null || tokenData.tokens.Length < MinNumTokensForFile)
                    {
                        continue;
                    }

                    var spVect = new SparseVector();
                    spVect.AddElements(Count(tokenData.tokens).Select(tc => (_dict.AddOrGet(tc.Token), tc.Count)));
                    projectIndex[tokenData.filename] = spVect;
                }
            }
        }

        /// <summary>
        /// Enumerates every unordered pair of indexed projects, plus each project
        /// paired with itself so that duplicates inside one project are found too.
        /// </summary>
        private IEnumerable<(string Project1, string Project2)> GetAllProjectCombinations()
        {
            var allProjects = _index.Keys.ToArray();
            for (int i = 0; i < allProjects.Length; i++)
            {
                for (int j = i + 1; j < allProjects.Length; j++)
                {
                    yield return (allProjects[i], allProjects[j]);
                }
                yield return (allProjects[i], allProjects[i]);
            }
        }

        /// <summary>
        /// Searches all project combinations in parallel for near-duplicate files.
        /// </summary>
        /// <param name="keyJaccardThreshold">Minimum <c>KeyJaccardSimilarity</c> a pair must reach.</param>
        /// <param name="jaccardThreshold">Minimum <c>JaccardSimilarity</c> a pair must reach.</param>
        /// <returns>
        /// A lazily evaluated sequence of (file, duplicate, similarities). The search runs,
        /// and <see cref="Duplicates"/> is filled, only while the sequence is enumerated.
        /// Files already reported stay excluded, so a second enumeration does not
        /// reproduce the first one's results.
        /// </returns>
        public IEnumerable<(string File1, string File2, double JaccardSimilarity, double KeyJacardSimilarity)> FindNearDuplicates(double keyJaccardThreshold, double jaccardThreshold)
        {
            return GetAllProjectCombinations()
                .AsParallel()
                .SelectMany(projs => FindNearDuplicates(keyJaccardThreshold, jaccardThreshold, projs.Project1, projs.Project2));
        }

        /// <summary>
        /// Compares every not-yet-reported file of <paramref name="project1"/> against every
        /// not-yet-reported file of <paramref name="project2"/> and yields the pairs that
        /// pass both thresholds. The cheaper key-based similarity is checked first.
        /// </summary>
        private IEnumerable<(string File1, string File2, double JaccardSimilarity, double KeyJacardSimilarity)> FindNearDuplicates(double keyJaccardThreshold, double jaccardThreshold, string project1, string project2)
        {
            return _index[project1]
                .AsParallel()
                .Where(f => !_alreadyDuplicatedFiles.ContainsKey(f.Key))
                .SelectMany(fileInProject1 => ComputeSimilarity(fileInProject1));

            IEnumerable<(string File1, string File2, double JaccardSimilarity, double KeyJacardSimilarity)> ComputeSimilarity(KeyValuePair<string, SparseVector> fileInProject1)
            {
                foreach (var fileInProject2 in _index[project2])
                {
                    if (_alreadyDuplicatedFiles.ContainsKey(fileInProject2.Key))
                    {
                        continue; // Already reported as a duplicate of another file
                    }
                    // NOTE(review): identity is decided by file name alone, so equally named
                    // files from different projects are never compared - confirm that the
                    // names in the archives are unique across projects.
                    if (fileInProject1.Key == fileInProject2.Key)
                    {
                        continue; // The file is itself
                    }

                    var keyJaccardSimilarity = fileInProject1.Value.KeyJaccardSimilarity(fileInProject2.Value);
                    if (keyJaccardSimilarity < keyJaccardThreshold)
                    {
                        continue;
                    }

                    var jaccardSimilarity = fileInProject1.Value.JaccardSimilarity(fileInProject2.Value);
                    if (jaccardSimilarity < jaccardThreshold)
                    {
                        continue;
                    }

                    // Another worker may have claimed this file between the check above and
                    // now. Only the worker whose TryAdd succeeds reports it, so each file is
                    // reported as a duplicate at most once.
                    if (!_alreadyDuplicatedFiles.TryAdd(fileInProject2.Key, true))
                    {
                        continue;
                    }

                    AddDuplicate(fileInProject1.Key, fileInProject2.Key);
                    yield return (fileInProject1.Key, fileInProject2.Key, jaccardSimilarity, keyJaccardSimilarity);
                }
            }
        }
    }
}
0 commit comments