diff --git a/cli/src/cli/commands/run.ts b/cli/src/cli/commands/run.ts index dce6e7199..fbe5adf61 100644 --- a/cli/src/cli/commands/run.ts +++ b/cli/src/cli/commands/run.ts @@ -57,6 +57,13 @@ export function runCommand(program: Command): Command { ), x => parseFloat(x), ) + .option( + "-i, --ignore <path>", + Utils.indent( + "Path of a file with template/boilerplate code. " + + "Code fragments matching with this file will be ignored." + ) + ) .option( "-L, --limit-results ", Utils.indent( @@ -171,6 +178,7 @@ interface RunOptions extends Options { host: string; outputFormat: string; outputDestination: string; + ignore: string; } export async function run(locations: string[], options: RunOptions): Promise { @@ -199,7 +207,7 @@ export async function run(locations: string[], options: RunOptions): Promise 0) { report.warnings.forEach(warn => warning(warn)); diff --git a/cli/src/cli/views/fileView.ts b/cli/src/cli/views/fileView.ts index abd39b0b5..89ca06b9a 100644 --- a/cli/src/cli/views/fileView.ts +++ b/cli/src/cli/views/fileView.ts @@ -77,6 +77,7 @@ export class FileView extends View { { "id": s => s.id, "hash": s => s.hash, + "ignored": s => s.ignored ? "true" : "false", "data": s => s.kgram?.join(" ") || null, "files": s => JSON.stringify(s.files().map(f => f.id)) }); @@ -85,9 +86,10 @@ export class FileView extends View { public writeFiles(out: Writable): void { writeCSVto( out, - this.report.entries(), + this.report.entries().concat(this.report.ignoredEntries()), { "id": f => f.file.id, + "ignored": f => f.isIgnored ? 
"true" : "false", "path": f => f.file.path, "content": f => f.file.content, "amountOfKgrams": f => f.kgrams.length, diff --git a/core/src/algorithm/fingerprintIndex.ts b/core/src/algorithm/fingerprintIndex.ts index e35c19850..0210871bb 100644 --- a/core/src/algorithm/fingerprintIndex.ts +++ b/core/src/algorithm/fingerprintIndex.ts @@ -12,8 +12,10 @@ export type Hash = number; export interface FileEntry { file: TokenizedFile; - kgrams: Array, + kgrams: Array; shared: Set; + ignored: Set; + isIgnored: boolean; } export interface Occurrence { @@ -22,9 +24,16 @@ export interface Occurrence { } export class FingerprintIndex { + // HashFilter transforms tokens into (a selection of) hashes private readonly hashFilter: HashFilter; + // A map of file id to FileEntry object that has been analysed private readonly files: Map; + // A map of file id to FileEntry object that is ignored (e.g. template code) + private readonly ignoredFiles: Map; + // A map of hashes to their Shared Fingerprints (which keep track of the files they are in) private readonly index: Map; + // A set of ignored hashes (either manually added, or through the ignored files, NOT because of maxFileCount) + private readonly ignoredHashes: Set; /** * Creates a Fingerprint Index which is able to compare files with each other @@ -34,11 +43,48 @@ export class FingerprintIndex { constructor( private readonly kgramLength: number, private readonly kgramsInWindow: number, - kgramData?: boolean + kgramData?: boolean, + private maxFingerprintFileCount = Number.MAX_SAFE_INTEGER, ) { this.hashFilter = new WinnowFilter(this.kgramLength, this.kgramsInWindow, kgramData); this.files = new Map(); + this.ignoredFiles = new Map(); this.index = new Map(); + this.ignoredHashes = new Set(); + } + + public addIgnoredFile(file: TokenizedFile): void { + assert(!this.ignoredFiles.has(file.id), `This file has already been ignored: ${file.file.path}`); + const entry: FileEntry = { + file, + kgrams: [], + isIgnored: true, + shared: new 
Set(), + ignored: new Set() + }; + + this.ignoredFiles.set(file.id, entry); + this.addEntry(entry); + } + + public getMaxFingerprintFileCount(): number { + return this.maxFingerprintFileCount; + } + + public updateMaxFingerprintFileCount(maxFingerprintFileCount: number | undefined): void { + if (maxFingerprintFileCount == this.maxFingerprintFileCount) { + return; + } + this.maxFingerprintFileCount = maxFingerprintFileCount || Number.MAX_SAFE_INTEGER; + for (const shared of this.index.values()) { + if (!this.ignoredHashes.has(shared.hash)) { + if (shared.fileCount() > this.maxFingerprintFileCount && !shared.ignored) { + this.ignoreSharedFingerprint(shared); + } else if (shared.fileCount() <= this.maxFingerprintFileCount && shared.ignored) { + this.unIgnoreSharedFingerprint(shared); + } + } + } } public addFiles(tokenizedFiles: TokenizedFile[]): Map { @@ -48,69 +94,108 @@ export class FingerprintIndex { } for (const file of tokenizedFiles) { - let kgram = 0; - const entry: FileEntry = { file, kgrams: [], - shared: new Set() + isIgnored: false, + shared: new Set(), + ignored: new Set() }; this.files.set(file.id, entry); + this.addEntry(entry); + } - for ( - const { data, hash, start, stop } - of this.hashFilter.fingerprints(file.tokens) - ) { - - // add kgram to file - entry.kgrams.push(new Range(start, stop)); - - // sanity check - assert( - Region.isInOrder( - file.mapping[start], - file.mapping[stop] - ) - // If we end our kgram on a ')', the location of the opening token is used. - // However, the location of this token in the file might be before - // the location of the starting token of the kmer - // For example: the last token of every ast is ')', closing the program. - // The location of this token is always (0, 0), since the program root is the first token. - // In this way, the 'end' token is before any other token in the AST. 
- || file.tokens[stop] === ")" , - `Invalid ordering: - expected ${file.mapping[start]} - to start be before the end of ${file.mapping[stop]}` - ); + return this.index; + } + + private addEntry(entry: FileEntry): void { + const file = entry.file; + let kgram = 0; + for ( + const { data, hash, start, stop } + of this.hashFilter.fingerprints(file.tokens) + ) { - const location = Region.merge( + // add kgram to file + entry.kgrams.push(new Range(start, stop)); + + // sanity check + assert( + Region.isInOrder( file.mapping[start], file.mapping[stop] - ); + ) + // If we end our kgram on a ')', the location of the opening token is used. + // However, the location of this token in the file might be before + // the location of the starting token of the kmer + // For example: the last token of every ast is ')', closing the program. + // The location of this token is always (0, 0), since the program root is the first token. + // In this way, the 'end' token is before any other token in the AST. + || file.tokens[stop] === ")" , + `Invalid ordering: + expected ${file.mapping[start]} + to start be before the end of ${file.mapping[stop]}` + ); - const part: Occurrence = { - file, - side: { index: kgram, start, stop, data, location } - }; + const location = Region.merge( + file.mapping[start], + file.mapping[stop] + ); - // look if the index already contains the given hashing - let shared: SharedFingerprint | undefined = this.index.get(hash); + const part: Occurrence = { + file, + side: { index: kgram, start, stop, data, location } + }; - if (!shared) { - // if the hashing does not yet exist in the index, add it - shared = new SharedFingerprint(hash, data); - this.index.set(hash, shared); - } + // look if the index already contains the given hashing + let shared: SharedFingerprint | undefined = this.index.get(hash); + + if (!shared) { + // if the hashing does not yet exist in the index, add it + shared = new SharedFingerprint(hash, data); + this.index.set(hash, shared); + } - 
shared.add(part); + shared.add(part); + if (entry.isIgnored || shared.fileCount() > this.maxFingerprintFileCount || this.ignoredHashes.has(hash)) { + this.ignoreSharedFingerprint(shared); + } else { entry.shared.add(shared); + } + + kgram += 1; + } + } - kgram += 1; + public addIgnoredHashes(hashes: Array): void { + for (const hash of hashes) { + this.ignoredHashes.add(hash); + const shared = this.index.get(hash); + if (shared) { + this.ignoreSharedFingerprint(shared); } } + } - return this.index; + private ignoreSharedFingerprint(shared: SharedFingerprint): void { + shared.ignored = true; + for (const other of shared.files()) { + if (!this.ignoredFiles.has(other.id)) { + const otherEntry = this.files.get(other.id)!; + otherEntry.shared.delete(shared); + otherEntry.ignored.add(shared); + } + } + } + + private unIgnoreSharedFingerprint(shared: SharedFingerprint): void { + shared.ignored = false; + for (const other of shared.files()) { + const otherEntry = this.files.get(other.id)!; + otherEntry.ignored.delete(shared); + otherEntry.shared.add(shared); + } } public sharedFingerprints(): Array { @@ -121,6 +206,10 @@ export class FingerprintIndex { return Array.from(this.files.values()); } + public ignoredEntries(): Array { + return Array.from(this.ignoredFiles.values()); + } + public getPair(file1: TokenizedFile, file2: TokenizedFile): Pair { const entry1 = this.files.get(file1.id); const entry2 = this.files.get(file2.id); diff --git a/core/src/algorithm/pair.ts b/core/src/algorithm/pair.ts index e3a88d3f0..278ea0e0c 100644 --- a/core/src/algorithm/pair.ts +++ b/core/src/algorithm/pair.ts @@ -30,6 +30,8 @@ export class Pair extends Identifiable { public readonly rightTotal; public readonly longest; public readonly similarity; + public readonly leftIgnored; + public readonly rightIgnored; constructor( public readonly leftEntry: FileEntry, @@ -71,10 +73,13 @@ export class Pair extends Identifiable { this.leftCovered = left.length; this.rightCovered = right.length; + 
this.leftIgnored = leftEntry.ignored.size; + this.rightIgnored = rightEntry.ignored.size; this.leftTotal = leftEntry.kgrams.length; this.rightTotal = rightEntry.kgrams.length; - if (this.leftTotal + this.rightTotal > 0) { - this.similarity = (this.leftCovered + this.rightCovered) / (this.leftTotal + this.rightTotal); + const denominator = this.leftTotal + this.rightTotal - this.leftIgnored - this.rightIgnored; + if (denominator > 0) { + this.similarity = (this.leftCovered + this.rightCovered) / denominator; } else { this.similarity = 0; } diff --git a/core/src/algorithm/sharedFingerprint.ts b/core/src/algorithm/sharedFingerprint.ts index 109ed45a5..e4767186d 100644 --- a/core/src/algorithm/sharedFingerprint.ts +++ b/core/src/algorithm/sharedFingerprint.ts @@ -4,6 +4,9 @@ import { Identifiable } from "../util/identifiable.js"; export class SharedFingerprint extends Identifiable { + // Whether this SharedFingerprint occurs in the boilerplate/template code + public ignored: boolean = false; + private partMap: Map> = new Map(); constructor( @@ -40,4 +43,8 @@ export class SharedFingerprint extends Identifiable { public fileCount(): number { return this.partMap.size; } + + public includesFile(file: TokenizedFile): boolean { + return this.partMap.has(file); + } } diff --git a/core/src/test/pair.test.ts b/core/src/test/pair.test.ts index 3ba00e6de..acd7978cb 100644 --- a/core/src/test/pair.test.ts +++ b/core/src/test/pair.test.ts @@ -54,13 +54,17 @@ test("paired occurrence merging & squashing", t => { const left = { kgrams: new Array(), shared: new Set(), - file: leftFile + ignored: new Set(), + file: leftFile, + isIgnored: false, }; const right = { kgrams: new Array(), shared: new Set(), - file: rightFile + ignored: new Set(), + file: rightFile, + isIgnored: false, }; diff --git a/docs/docs/running.md b/docs/docs/running.md index 4f6c9c949..ea7501d28 100644 --- a/docs/docs/running.md +++ b/docs/docs/running.md @@ -49,6 +49,28 @@ You can show all command line options by 
passing the `-h` or `--help` flag or by You can improve the plagiarism detection report by adding metadata to your submissions (submission time, labels, author name, ...). See the page about [adding metadata](/docs/adding-metadata) to see how. +## Ignoring template code + +Programming exercises often have code in common that is not plagiarised. For example: class and method definitions, given test cases, boilerplate code, ... +Dolos will often detect these code fragments as similar and include them in the similarity score, making it harder to spot actual plagiarism. + +With the `-i <path>` or `--ignore <path>` parameter, you can add an _ignore_ file (often also called a _template_ or _boilerplate_) to the analysis. +Code fragments from analysed solutions that match with this file will be ignored and these fingerprints will not count towards similarity. + +In addition, it is also possible to **automatically detect** common code. +By passing `-m <integer>` or `--max-fingerprint-count <integer>` you can specify a maximum number of files a code fragment can occur in before it is ignored. +With `-M <fraction>` or `--max-fingerprint-percentage <fraction>` it is possible to specify this number as a fraction (percentage) of the total analysed file count. +It is possible to combine this with specifying an ignore file with the `-i` option. + + +Example usage: + +```sh +# Ignore all code fragments occurring in more than half of the files, +# or occurring in template.js +dolos run -M 0.5 -i template.js solutions/*.js +``` + ## Modifying plagiarism detection parameters The plagiarism detection parameters can be altered by passing the relevant arguments when running Dolos. 
diff --git a/lib/src/lib/dolos.ts b/lib/src/lib/dolos.ts index b657fc72f..015b9b824 100644 --- a/lib/src/lib/dolos.ts +++ b/lib/src/lib/dolos.ts @@ -91,7 +91,7 @@ export class Dolos { } - public async analyzePaths(paths: string[]): Promise { + public async analyzePaths(paths: string[], ignore?: string): Promise { let files = null; let nameCandidate = undefined; if(paths.length == 1) { @@ -113,12 +113,17 @@ export class Dolos { nameCandidate = path.basename(paths[0]) + " & " + path.basename(paths[1]); } } - return this.analyze((await files).ok(), nameCandidate); + let ignoredFile; + if (ignore) { + ignoredFile = (await readPath(ignore)).ok(); + } + return this.analyze((await files).ok(), nameCandidate, ignoredFile); } public async analyze( files: Array, - nameCandidate?: string + nameCandidate?: string, + ignoredFile?: File ): Promise { if (this.index == null) { @@ -131,7 +136,6 @@ export class Dolos { this.tokenizer = await this.language.createTokenizer(); this.index = new FingerprintIndex(this.options.kgramLength, this.options.kgramsInWindow, this.options.kgramData); } - const warnings = []; let filteredFiles; if (this.languageDetected) { @@ -148,18 +152,32 @@ export class Dolos { filteredFiles = files; } - if (files.length < 2) { + if (filteredFiles.length < 2) { throw new Error("You need to supply at least two files"); - } else if (files.length == 2 && this.options.maxFingerprintPercentage !== null) { + } else if (filteredFiles.length == 2 && this.options.maxFingerprintPercentage !== null) { throw new Error("You have given a maximum hash percentage but your are " + "comparing two files. Each matching hash will thus " + "be present in 100% of the files. 
This option does only" + "make sense when comparing more than two files."); } + let maxFingerprintFileCount = undefined; + if (this.options.maxFingerprintCount != null) { + maxFingerprintFileCount = this.options.maxFingerprintCount; + } else if (this.options.maxFingerprintPercentage != null) { + const total = this.index.entries().length + filteredFiles.length; + maxFingerprintFileCount = this.options.maxFingerprintPercentage * total; + } + + this.index.updateMaxFingerprintFileCount(maxFingerprintFileCount); + // eslint-disable-next-line @typescript-eslint/no-non-null-assertion const tokenizedFiles = filteredFiles.map(f => this.tokenizer!.tokenizeFile(f)); this.index.addFiles(tokenizedFiles); + if (ignoredFile) { + const tokenizedTemplate = this.tokenizer!.tokenizeFile(ignoredFile); + this.index.addIgnoredFile(tokenizedTemplate); + } return new Report( this.options, diff --git a/lib/src/lib/report.ts b/lib/src/lib/report.ts index e80255641..f5068f134 100644 --- a/lib/src/lib/report.ts +++ b/lib/src/lib/report.ts @@ -9,11 +9,8 @@ export interface Metadata extends DolosOptions { } export class Report { - // maximum amount of files a kgram can occur in a file before it is ignored - private readonly kgramMaxFileOccurrences: number; private pairs: Array = []; - private fingerprints: Array = []; public readonly name: string; public readonly createdAt: string = new Date().toISOString(); @@ -26,23 +23,11 @@ export class Report { name?: string, public readonly warnings: string[] = [], ) { - if (this.options.maxFingerprintCount != null) { - this.kgramMaxFileOccurrences = this.options.maxFingerprintCount; - } else if (this.options.maxFingerprintPercentage != null) { - this.kgramMaxFileOccurrences = this.options.maxFingerprintPercentage * this.files.length; - } else { - this.kgramMaxFileOccurrences = this.files.length; - } - if (this.options.reportName) { this.name = this.options.reportName; } else { this.name = name || `${this.files.length} ${ this.language?.name } files`; } - 
- this.fingerprints = Array.from(index.sharedFingerprints()).filter( - (k: SharedFingerprint) => k.fileCount() <= this.kgramMaxFileOccurrences, - ); } public getPair(file1: TokenizedFile, file2: TokenizedFile): Pair { @@ -57,13 +42,17 @@ export class Report { } public sharedFingerprints(): Array { - return Array.from(this.fingerprints.values()); + return this.index.sharedFingerprints(); } public entries(): Array { return this.index.entries(); } + public ignoredEntries(): Array { + return this.index.ignoredEntries(); + } + public metadata(): Metadata { return { ...this.options.asObject(), diff --git a/lib/src/test/dolos.test.ts b/lib/src/test/dolos.test.ts index 24171227b..ac74d4ca8 100644 --- a/lib/src/test/dolos.test.ts +++ b/lib/src/test/dolos.test.ts @@ -288,6 +288,106 @@ test("should generate warning when not all files match detected language", async t.is(1, pairs.length); }); +test("should ignore template code", async t => { + const dolos = new Dolos(); + + const report = await dolos.analyzePaths([ + "./src/test/fixtures/javascript/boilerplate_copy.js", + "./src/test/fixtures/javascript/implementation-unique.js", + "./src/test/fixtures/javascript/implementation-alternative.js", + "./src/test/fixtures/javascript/implementation-alternative-similar.js", + ], "./src/test/fixtures/javascript/boilerplate_copy.js"); + + const files = report.files; + t.is(4, files.length); + + const boilerplate = files[0]; + const unique = files[1]; + const alternative = files[2]; + const similar = files[3]; + + // Boilerplate copy should not have a match + t.is(0, report.getPair(boilerplate, unique).similarity); + t.is(0, report.getPair(boilerplate, alternative).similarity); + t.is(0, report.getPair(boilerplate, similar).similarity); + + + const unique_alternative = report.getPair(unique, alternative); + const unique_similar = report.getPair(unique, similar); + const alternative_similar = report.getPair(alternative, similar); + t.true(unique_alternative.similarity < 
alternative_similar.similarity, "Pairs with unique should be less similar"); + t.true(unique_similar.similarity < alternative_similar.similarity, "Pairs with unique should be less similar"); + t.true(alternative_similar.similarity > 0.5, "Pairs with similar code should have a similarity above 50%"); +}); + +test("should ignore with maxFingerprintCount", async t => { + const dolos = new Dolos({ + maxFingerprintCount: 3 + }); + + const report = await dolos.analyzePaths([ + "./src/test/fixtures/javascript/boilerplate_copy.js", + "./src/test/fixtures/javascript/implementation-unique.js", + "./src/test/fixtures/javascript/implementation-alternative.js", + "./src/test/fixtures/javascript/implementation-alternative-similar.js", + ]); + + const files = report.files; + t.is(4, files.length); + + const boilerplate = files[0]; + const unique = files[1]; + const alternative = files[2]; + const similar = files[3]; + + // Boilerplate copy should not have a match + t.is(0, report.getPair(boilerplate, unique).similarity); + t.is(0, report.getPair(boilerplate, alternative).similarity); + t.is(0, report.getPair(boilerplate, similar).similarity); + + + const unique_alternative = report.getPair(unique, alternative); + const unique_similar = report.getPair(unique, similar); + const alternative_similar = report.getPair(alternative, similar); + t.true(unique_alternative.similarity < alternative_similar.similarity, "Pairs with unique should be less similar"); + t.true(unique_similar.similarity < alternative_similar.similarity, "Pairs with unique should be less similar"); + t.true(alternative_similar.similarity > 0.5, "Pairs with similar code should have a similarity above 50%"); +}); + +test("should ignore with maxFingerprintPercentage", async t => { + const dolos = new Dolos({ + maxFingerprintPercentage: 0.75 + }); + + const report = await dolos.analyzePaths([ + "./src/test/fixtures/javascript/boilerplate_copy.js", + "./src/test/fixtures/javascript/implementation-unique.js", + 
"./src/test/fixtures/javascript/implementation-alternative.js", + "./src/test/fixtures/javascript/implementation-alternative-similar.js", + ]); + + const files = report.files; + t.is(4, files.length); + + const boilerplate = files[0]; + const unique = files[1]; + const alternative = files[2]; + const similar = files[3]; + + // Boilerplate copy should not have a match + t.is(0, report.getPair(boilerplate, unique).similarity); + t.is(0, report.getPair(boilerplate, alternative).similarity); + t.is(0, report.getPair(boilerplate, similar).similarity); + + + const unique_alternative = report.getPair(unique, alternative); + const unique_similar = report.getPair(unique, similar); + const alternative_similar = report.getPair(alternative, similar); + t.true(unique_alternative.similarity < alternative_similar.similarity, "Pairs with unique should be less similar"); + t.true(unique_similar.similarity < alternative_similar.similarity, "Pairs with unique should be less similar"); + t.true(alternative_similar.similarity > 0.5, "Pairs with similar code should have a similarity above 50%"); +}); + test.failing("repeating sequences should not cause too many fragments", async t => { const dolos = new Dolos(); diff --git a/lib/src/test/fixtures/javascript/boilerplate.js b/lib/src/test/fixtures/javascript/boilerplate.js new file mode 100644 index 000000000..e03e7023e --- /dev/null +++ b/lib/src/test/fixtures/javascript/boilerplate.js @@ -0,0 +1,18 @@ +class Pearson { + constructor(tabel, combineer) { + this.tabel = tabel || [...new Array(256).keys()]; + if (this.tabel.length !== 256) { + throw "AssertionError: ongeldige tabel"; + } + for (let i = 0; i < 256; i++) { + if (!this.tabel.includes(i)) { + throw "AssertionError: ongeldige tabel"; + } + } + this.combineer = combineer || ((h, v) => (h + v) % 256); + } + + hash(s) { + // TODO: IMPLEMENT THIS FUNCTION + } +} diff --git a/lib/src/test/fixtures/javascript/boilerplate_copy.js b/lib/src/test/fixtures/javascript/boilerplate_copy.js 
new file mode 100644 index 000000000..e03e7023e --- /dev/null +++ b/lib/src/test/fixtures/javascript/boilerplate_copy.js @@ -0,0 +1,18 @@ +class Pearson { + constructor(tabel, combineer) { + this.tabel = tabel || [...new Array(256).keys()]; + if (this.tabel.length !== 256) { + throw "AssertionError: ongeldige tabel"; + } + for (let i = 0; i < 256; i++) { + if (!this.tabel.includes(i)) { + throw "AssertionError: ongeldige tabel"; + } + } + this.combineer = combineer || ((h, v) => (h + v) % 256); + } + + hash(s) { + // TODO: IMPLEMENT THIS FUNCTION + } +} diff --git a/lib/src/test/fixtures/javascript/implementation-alternative-similar.js b/lib/src/test/fixtures/javascript/implementation-alternative-similar.js new file mode 100644 index 000000000..830f38286 --- /dev/null +++ b/lib/src/test/fixtures/javascript/implementation-alternative-similar.js @@ -0,0 +1,30 @@ +class Pearson { + constructor(tabel, combineer) { + this.tabel = tabel || [...new Array(256).keys()]; + if (this.tabel.length !== 256) { + throw "AssertionError: ongeldige tabel"; + } + for (let i = 0; i < 256; i++) { + if (!this.tabel.includes(i)) { + throw "AssertionError: ongeldige tabel"; + } + } + this.combineer = combineer || ((h, v) => (h + v) % 256); + } + + hash(s) { + // TODO: implement this function + let r = 256; + for (let i = 0; i < 256; i++) { + r -=1; + } + let a = r, b = 0; + while(b < s.length) { + const c = s[b].charCodeAt(0); + const x = this.combineer(c, a); + a = this.tabel[x]; + b += 1; + } + return a; + } +} diff --git a/lib/src/test/fixtures/javascript/implementation-alternative.js b/lib/src/test/fixtures/javascript/implementation-alternative.js new file mode 100644 index 000000000..449d17895 --- /dev/null +++ b/lib/src/test/fixtures/javascript/implementation-alternative.js @@ -0,0 +1,25 @@ +class Pearson { + constructor(tabel, combineer) { + this.tabel = tabel || [...new Array(256).keys()]; + if (this.tabel.length !== 256) { + throw "AssertionError: ongeldige tabel"; + } + for (let 
i = 0; i < 256; i++) { + if (!this.tabel.includes(i)) { + throw "AssertionError: ongeldige tabel"; + } + } + this.combineer = combineer || ((h, v) => (h + v) % 256); + } + + hash(s) { + let h = 0, i = 0; + while(i < s.length) { + const c = s[i].charCodeAt(0); + const x = this.combineer(c, h); + h = this.tabel[x]; + i += 1; + } + return h; + } +} diff --git a/lib/src/test/fixtures/javascript/implementation-unique.js b/lib/src/test/fixtures/javascript/implementation-unique.js new file mode 100644 index 000000000..7923a8095 --- /dev/null +++ b/lib/src/test/fixtures/javascript/implementation-unique.js @@ -0,0 +1,22 @@ +class Pearson { + constructor(tabel, combineer) { + this.tabel = tabel || [...new Array(256).keys()]; + if (this.tabel.length !== 256) { + throw "AssertionError: ongeldige tabel"; + } + for (let i = 0; i < 256; i++) { + if (!this.tabel.includes(i)) { + throw "AssertionError: ongeldige tabel"; + } + } + this.combineer = combineer || ((h, v) => (h + v) % 256); + } + + hash(s) { + let h = 0; + for (let c of s) { + h = this.tabel[this.combineer(h, c.charCodeAt(0))]; + } + return h; + } +} diff --git a/web/src/api/models/file.ts b/web/src/api/models/file.ts index 0fe5df229..9d83c67bb 100644 --- a/web/src/api/models/file.ts +++ b/web/src/api/models/file.ts @@ -9,6 +9,7 @@ export interface FileIndeterminate { ast: string[] | string; mapping: Selection[] | string; amountOfKgrams: number; + ignored: boolean; label: Label; extra: { timestamp?: Date; diff --git a/web/src/api/models/kgram.ts b/web/src/api/models/kgram.ts index 4465ed1ac..24b0be695 100644 --- a/web/src/api/models/kgram.ts +++ b/web/src/api/models/kgram.ts @@ -2,6 +2,7 @@ import { Hash, File } from "@/api/models"; export interface Kgram { id: number; + ignored: boolean; hash: Hash; data: string; files: File[]; diff --git a/web/src/api/stores/file.store.ts b/web/src/api/stores/file.store.ts index 52cc49f6d..bfa3dd0fa 100644 --- a/web/src/api/stores/file.store.ts +++ b/web/src/api/stores/file.store.ts 
@@ -21,6 +21,7 @@ export const useFileStore = defineStore("file", () => { // State const hydrated = shallowRef(false); + const ignoredFile = shallowRef(); const filesById = shallowRef([]); const filesList = computed(() => Object.values(filesById.value)); @@ -107,6 +108,7 @@ export const useFileStore = defineStore("file", () => { const parsed = parse(await fetch()); filesById.value = parsed.files; filesActiveById.value = parsed.files; + ignoredFile.value = parsed.ignoredFile; legend.value = parsed.labels; hasLabels.value = parsed.hasLabels; hasUnlabeled.value = parsed.hasUnlabeled; @@ -122,6 +124,7 @@ export const useFileStore = defineStore("file", () => { // Parse the files from a CSV string. function parse(fileData: any[]): { + ignoredFile?: File; files: File[]; labels: Legend; hasLabels: boolean; @@ -145,6 +148,7 @@ export const useFileStore = defineStore("file", () => { const files: File[] = []; const labels: Legend = {}; + let ignoredFile; let hasLabels = false; let hasUnlabeled = false; let hasTimestamps = false; @@ -156,6 +160,7 @@ export const useFileStore = defineStore("file", () => { const extra = JSON.parse(row.extra || "{}"); extra.timestamp = extra.createdAt && new Date(extra.createdAt); hasTimestamps = hasTimestamps || !!extra.timestamp; + file.ignored = row.ignored == "true" file.extra = extra; file.ast = JSON.parse(row.ast); file.mapping = JSON.parse(row.mapping); @@ -203,7 +208,11 @@ export const useFileStore = defineStore("file", () => { labels: extra.labels, }; - files[file.id] = file; + if (!file.ignored) { + files[file.id] = file; + } else { + ignoredFile = file; + } } // Find the common path in the files. @@ -230,7 +239,7 @@ export const useFileStore = defineStore("file", () => { labels[defaultLabel.name] = defaultLabel; } - return { files, labels, hasLabels, hasUnlabeled, hasTimestamps }; + return { files, ignoredFile, labels, hasLabels, hasUnlabeled, hasTimestamps }; } // Anonymize the data. 
@@ -290,6 +299,7 @@ export const useFileStore = defineStore("file", () => { ); return { + ignoredFile, filesById, filesList, filesActiveById, diff --git a/web/src/api/stores/kgram.store.ts b/web/src/api/stores/kgram.store.ts index 8b99977dc..822406370 100644 --- a/web/src/api/stores/kgram.store.ts +++ b/web/src/api/stores/kgram.store.ts @@ -10,6 +10,7 @@ import { useFileStore, useApiStore } from "@/api/stores"; export const useKgramStore = defineStore("kgrams", () => { // List of k-grams. const kgrams = shallowRef([]); + const ignoredKgrams = shallowRef([]); // If this store has been hydrated. const hydrated = shallowRef(false); @@ -18,20 +19,34 @@ export const useKgramStore = defineStore("kgrams", () => { function parse( kgramData: any[], filesById: File[] - ): Kgram[] { + ): { + kgrams: Kgram[], + ignored: Kgram[] + } { const kgrams: Kgram[] = []; + const ignored: Kgram[] = []; for (const row of kgramData) { const id = parseInt(row.id); const fileIds: number[] = JSON.parse(row.files); const files: File[] = fileIds.map((id) => filesById[id]); - kgrams[id] = { + const kgram = { id, hash: parseInt(row.hash), + ignored: row.ignored == "true", data: row.data, files, }; + if (kgram.ignored) { + ignored.push(kgram); + } else { + kgrams[id] = kgram; + } + } - return kgrams; + return { + kgrams, + ignored + }; } // Reference to other stores. 
@@ -53,12 +68,15 @@ export const useKgramStore = defineStore("kgrams", () => { ); } - kgrams.value = parse(await fetch(), fileStore.filesById); + const parsed = parse(await fetch(), fileStore.filesById); + kgrams.value = parsed.kgrams; + ignoredKgrams.value = parsed.ignored; hydrated.value = true; } return { kgrams, + ignoredKgrams, hydrated, hydrate, }; diff --git a/web/src/api/stores/pair.store.ts b/web/src/api/stores/pair.store.ts index 323680727..f1e497a2e 100644 --- a/web/src/api/stores/pair.store.ts +++ b/web/src/api/stores/pair.store.ts @@ -113,9 +113,11 @@ export const usePairStore = defineStore("pairs", () => { // Populate the fragments for a given pair. async function populateFragments(pair: Pair): Promise { const customOptions = metadataStore.metadata; - const kmers = kgramStore.kgrams; + const kgrams = kgramStore.kgrams; + const ignoredKgrams = kgramStore.ignoredKgrams; + const ignoredFile = fileStore.ignoredFile; - const pairWithFragments = await dataWorker.populateFragments(pair, customOptions, kmers); + const pairWithFragments = await dataWorker.populateFragments(pair, customOptions, kgrams, ignoredKgrams, ignoredFile); pairsById.value[pair.id] = pairWithFragments; return pairWithFragments; } diff --git a/web/src/api/workers/data.worker.ts b/web/src/api/workers/data.worker.ts index 80a9fe0ed..76e1a30c9 100644 --- a/web/src/api/workers/data.worker.ts +++ b/web/src/api/workers/data.worker.ts @@ -1,5 +1,6 @@ import { fileToTokenizedFile } from "@/api/utils"; import { + File, Pair, Kgram, Metadata, @@ -40,7 +41,9 @@ export function parseFragments( export function populateFragments( pair: Pair, metadata: Metadata, - kgrams: Kgram[] + kgrams: Kgram[], + ignoredKgrams: Kgram[], + ignoredFile?: File, ): Pair { const customOptions = metadata; const kmers = kgrams; @@ -49,6 +52,11 @@ export function populateFragments( const leftFile = fileToTokenizedFile(pair.leftFile); const rightFile = fileToTokenizedFile(pair.rightFile); index.addFiles([leftFile, 
rightFile]); + if (ignoredFile) { + const ignored = fileToTokenizedFile(ignoredFile); + index.addIgnoredFile(ignored); + } + index.addIgnoredHashes(ignoredKgrams.map(k => k.hash)); const reportPair = index.getPair(leftFile, rightFile); const kmersMap: Map = new Map(); diff --git a/web/src/composables/useMonacoEditorWorkers.ts b/web/src/composables/useMonacoEditorWorkers.ts index 4c668a725..5d9e15a09 100644 --- a/web/src/composables/useMonacoEditorWorkers.ts +++ b/web/src/composables/useMonacoEditorWorkers.ts @@ -4,10 +4,7 @@ import EditorWorker from 'monaco-editor/esm/vs/editor/editor.worker?worker' export function useMonacoEditorWorkers() { onMounted(() => { - self.MonacoEnvironment = { - getWorker() { - return new EditorWorker() - } - } + self.MonacoEnvironment ||= {}; + self.MonacoEnvironment.getWorker = () => new EditorWorker(); }); }