Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Greedy String Tiling out of bounds #2179

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 9 additions & 31 deletions core/src/main/java/de/jplag/GreedyStringTiling.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.stream.Collectors;

import de.jplag.options.JPlagOptions;
Expand All @@ -22,15 +20,14 @@
* String Similarity via Greedy String Tiling and Running Karp−Rabin Matching </a>
*/
public class GreedyStringTiling {

private final int minimumMatchLength;
private final JPlagOptions options;
private final ConcurrentMap<TokenType, Integer> tokenTypeValues;
private final Map<Submission, Set<Token>> baseCodeMarkings = new IdentityHashMap<>();

private final Map<Submission, int[]> cachedTokenValueLists = new IdentityHashMap<>();
private final Map<Submission, SubsequenceHashLookupTable> cachedHashLookupTables = new IdentityHashMap<>();

private final TokenValueMapper tokenValueMapper;

private static final String ERROR_INDEX_OUT_OF_BOUNDS = """
GST index out of bounds. This is probably a random issue caused by multithreading issues.
Length of the list that caused the exception (the list of marks for the relevant submission): %s, Index in that list: %s
Expand All @@ -40,13 +37,13 @@ Length of the list that caused the exception (the list of marks for the relevant
Submission (other): %s
""".trim().stripIndent();

public GreedyStringTiling(JPlagOptions options) {
public GreedyStringTiling(JPlagOptions options, TokenValueMapper tokenValueMapper) {
this.options = options;
// Ensures 1 <= neighborLength <= minimumTokenMatch
int minimumNeighborLength = Math.min(Math.max(options.mergingOptions().minimumNeighborLength(), 1), options.minimumTokenMatch());
this.minimumMatchLength = options.mergingOptions().enabled() ? minimumNeighborLength : options.minimumTokenMatch();
this.tokenTypeValues = new ConcurrentHashMap<>();
this.tokenTypeValues.put(SharedTokenType.FILE_END, 0);

this.tokenValueMapper = tokenValueMapper;
}

/**
Expand Down Expand Up @@ -108,8 +105,8 @@ public final JPlagComparison compare(Submission firstSubmission, Submission seco
* @return the comparison results.
*/
private JPlagComparison compareInternal(Submission leftSubmission, Submission rightSubmission) {
int[] leftValues = tokenValueListFromSubmission(leftSubmission);
int[] rightValues = tokenValueListFromSubmission(rightSubmission);
int[] leftValues = this.tokenValueMapper.getTokenValuesFor(leftSubmission);
int[] rightValues = this.tokenValueMapper.getTokenValuesFor(rightSubmission);

boolean[] leftMarked = calculateInitiallyMarked(leftSubmission);
boolean[] rightMarked = calculateInitiallyMarked(rightSubmission);
Expand Down Expand Up @@ -219,33 +216,14 @@ private boolean[] calculateInitiallyMarked(Submission submission) {

private SubsequenceHashLookupTable subsequenceHashLookupTableForSubmission(Submission submission, boolean[] marked) {
return cachedHashLookupTables.computeIfAbsent(submission,
(key -> new SubsequenceHashLookupTable(minimumMatchLength, tokenValueListFromSubmission(key), marked)));
}

/**
* Converts the tokens of the submission to a list of values.
* @param submission The submission from which to convert the tokens.
*/
private int[] tokenValueListFromSubmission(Submission submission) {
return cachedTokenValueLists.computeIfAbsent(submission, (key -> {
List<Token> tokens = key.getTokenList();
int[] tokenValueList = new int[tokens.size()];
for (int i = 0; i < tokens.size(); i++) {
TokenType type = tokens.get(i).getType();
synchronized (tokenTypeValues) {
tokenTypeValues.putIfAbsent(type, tokenTypeValues.size());
}
tokenValueList[i] = tokenTypeValues.get(type);
}
return tokenValueList;
}));
(key -> new SubsequenceHashLookupTable(minimumMatchLength, this.tokenValueMapper.getTokenValuesFor(submission), marked)));
}

private boolean checkMark(boolean[] marks, int index, Submission submission, Submission otherSubmission) {
if (index >= marks.length) {
throw new IllegalStateException(String.format(ERROR_INDEX_OUT_OF_BOUNDS, marks.length, index, submission.getTokenList().size(),
submission.getTokenList().stream().map(it -> it.getType().getDescription()).collect(Collectors.joining(", ")),
cachedTokenValueLists.get(submission).length, submission.getName(), otherSubmission.getName()));
this.tokenValueMapper.getTokenValuesFor(submission).length, submission.getName(), otherSubmission.getName()));
}

return marks[index];
Expand Down
13 changes: 8 additions & 5 deletions core/src/main/java/de/jplag/JPlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ private static Version loadVersion() {

/**
* Creates and initializes a JPlag instance, parameterized by a set of options.
* @deprecated in favor of static {@link #run(JPlagOptions)}.
* @param options determines the parameterization.
* @deprecated in favor of static {@link #run(JPlagOptions)}.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did this move down due to spotless?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. I'm pretty sure I didn't touch the Javadoc. But I think this is the canonical order.

*/
@Deprecated(since = "4.3.0")
public JPlag(JPlagOptions options) {
Expand All @@ -49,9 +49,9 @@ public JPlag(JPlagOptions options) {

/**
* Main procedure, executes the comparison of source code submissions.
* @deprecated in favor of static {@link #run(JPlagOptions)}.
* @return the results of the comparison, specifically the submissions whose similarity exceeds a set threshold.
* @throws ExitException if JPlag exits preemptively.
* @deprecated in favor of static {@link #run(JPlagOptions)}.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here

*/
@Deprecated(since = "4.3.0")
public JPlagResult run() throws ExitException {
Expand All @@ -66,11 +66,14 @@ public JPlagResult run() throws ExitException {
*/
public static JPlagResult run(JPlagOptions options) throws ExitException {
checkForConfigurationConsistency(options);
GreedyStringTiling coreAlgorithm = new GreedyStringTiling(options);
ComparisonStrategy comparisonStrategy = new ParallelComparisonStrategy(options, coreAlgorithm);

// Parse and validate submissions.
SubmissionSetBuilder builder = new SubmissionSetBuilder(options);
SubmissionSet submissionSet = builder.buildSubmissionSet();

GreedyStringTiling coreAlgorithm = new GreedyStringTiling(options, TokenValueMapper.generateTokenValueMapper(submissionSet));
ComparisonStrategy comparisonStrategy = new ParallelComparisonStrategy(options, coreAlgorithm);

if (options.normalize() && options.language().supportsNormalization() && options.language().requiresCoreNormalization()) {
submissionSet.normalizeSubmissions();
}
Expand Down Expand Up @@ -113,7 +116,7 @@ private static void checkForConfigurationConsistency(JPlagOptions options) throw
}

List<String> duplicateNames = getDuplicateSubmissionFolderNames(options);
if (duplicateNames.size() > 0) {
if (!duplicateNames.isEmpty()) {
throw new RootDirectoryException(String.format("Duplicate root directory names found: %s", String.join(", ", duplicateNames)));
}
}
Expand Down
49 changes: 49 additions & 0 deletions core/src/main/java/de/jplag/TokenValueMapper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package de.jplag;

import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;

import de.jplag.logging.ProgressBarLogger;
import de.jplag.logging.ProgressBarType;

public class TokenValueMapper {
private final Map<TokenType, Integer> tokenTypeValues;
private final Map<Submission, int[]> tokenValueMap;

public TokenValueMapper() {
this.tokenTypeValues = new HashMap<>();
this.tokenValueMap = new IdentityHashMap<>();

this.tokenTypeValues.put(SharedTokenType.FILE_END, 0);
}

public void addSubmissions(SubmissionSet submissionSet) {
ProgressBarLogger.iterate(ProgressBarType.HASH_CREATION, submissionSet.getSubmissions(), this::addSingleSubmission);
}

public void addSingleSubmission(Submission submission) {
List<Token> tokens = submission.getTokenList();
int[] tokenValues = new int[tokens.size()];
for (int i = 0; i < tokens.size(); i++) {
TokenType type = tokens.get(i).getType();
tokenTypeValues.putIfAbsent(type, tokenTypeValues.size());
tokenValues[i] = tokenTypeValues.get(type);
}
this.tokenValueMap.put(submission, tokenValues);
}

public int[] getTokenValuesFor(Submission submission) {
return this.tokenValueMap.get(submission);
}

public static TokenValueMapper generateTokenValueMapper(SubmissionSet submissionSet) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a special reason to have this generator method and not use the constrcutor?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kind of. Usually constructors should not contain application logic, at least the way I learned it. So having a constructor create the entire map automatically feels like it's bad code.

TokenValueMapper tokenValueMapper = new TokenValueMapper();
tokenValueMapper.addSubmissions(submissionSet);
if (submissionSet.hasBaseCode()) {
tokenValueMapper.addSingleSubmission(submissionSet.getBaseCode());
}
return tokenValueMapper;
}
}
1 change: 1 addition & 0 deletions core/src/main/java/de/jplag/logging/ProgressBarType.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
public enum ProgressBarType {
LOADING("Loading Submissions ", false),
PARSING("Parsing Submissions ", false),
HASH_CREATION("Preparing Submissions", false),
COMPARING("Comparing Submissions", false),
MATCH_MERGING("Merging matched subsequences ", false),
TOKEN_STRING_NORMALIZATION("Normalizing Token Sequence", false),
Expand Down
7 changes: 4 additions & 3 deletions core/src/test/java/de/jplag/merging/MergingTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import de.jplag.SubmissionSetBuilder;
import de.jplag.TestBase;
import de.jplag.Token;
import de.jplag.TokenValueMapper;
import de.jplag.exceptions.ExitException;
import de.jplag.options.JPlagOptions;
import de.jplag.strategy.ComparisonStrategy;
Expand All @@ -45,11 +46,11 @@ class MergingTest extends TestBase {
MergingTest() throws ExitException {
options = getDefaultOptions("merging").withMergingOptions(new MergingOptions(true, MINIMUM_NEIGHBOR_LENGTH, MAXIMUM_GAP_SIZE));

GreedyStringTiling coreAlgorithm = new GreedyStringTiling(options);
comparisonStrategy = new ParallelComparisonStrategy(options, coreAlgorithm);

SubmissionSetBuilder builder = new SubmissionSetBuilder(options);
submissionSet = builder.buildSubmissionSet();

GreedyStringTiling coreAlgorithm = new GreedyStringTiling(options, TokenValueMapper.generateTokenValueMapper(submissionSet));
comparisonStrategy = new ParallelComparisonStrategy(options, coreAlgorithm);
}

@BeforeEach
Expand Down
Loading