Skip to content

Commit

Permalink
mapreduce| Adds map/reduce functionality to SimilarChecker
Browse files Browse the repository at this point in the history
Before adding a new mixin this proves the concept works, adding tests as
examples of how this would work in the main linter.

The idea here is that, because `check_parallel()` uses a multiprocess
`map` function, the natural follow-on is to use a `reduce`
paradigm. This commit should demonstrate that.
  • Loading branch information
doublethefish committed Apr 23, 2020
1 parent 6f37c91 commit d1bbd4c
Show file tree
Hide file tree
Showing 4 changed files with 268 additions and 1 deletion.
30 changes: 29 additions & 1 deletion pylint/checkers/similar.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,20 @@ def _iter_sims(self):
for lineset2 in self.linesets[idx + 1 :]:
yield from self._find_common(lineset, lineset2)

def get_map_data(self):
    """Return the data this instance contributes to a map/reduce run.

    The payload is simply this instance's ``linesets`` attribute; the very
    same object is handed back (no copy is made), so callers that mutate the
    result also mutate this instance.
    """
    return self.linesets

def combine_mapreduce_data(self, linesets_collection):
    """Flatten mapped lineset groups back onto this instance.

    The partner function of get_map_data(). ``linesets_collection`` is an
    iterable whose items are themselves iterables of linesets (one group per
    mapped run). The groups are concatenated in order into a single new list
    that replaces ``self.linesets``.
    """
    flattened = []
    for linesets in linesets_collection:
        # Each group is one get_map_data() result; keep encounter order.
        flattened.extend(linesets)
    self.linesets = flattened


def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports):
"""return lines with leading/trailing whitespace and any ignored code
Expand Down Expand Up @@ -341,7 +355,7 @@ def __init__(self, linter=None):
def set_option(self, optname, value, action=None, optdict=None):
"""method called to set an option (registered in the options list)
overridden to report options setting to Similar
Overridden to report options setting to Similar
"""
BaseChecker.set_option(self, optname, value, action, optdict)
if optname == "min-similarity-lines":
Expand Down Expand Up @@ -391,6 +405,20 @@ def close(self):
stats["nb_duplicated_lines"] = duplicated
stats["percent_duplicated_lines"] = total and duplicated * 100.0 / total

def get_map_data(self):
    """Pass-through override.

    Delegates explicitly to ``Similar.get_map_data`` so the checker returns
    exactly what the underlying Similar implementation returns.
    """
    return Similar.get_map_data(self)

@classmethod
def reduce_map_data(cls, linter, data):
    """Recombine mapped lineset data and process it with a fresh checker.

    The partner function of get_map_data(). ``data`` is the collection of
    lineset groups gathered during the map step. A new SimilarChecker bound
    to ``linter`` is opened, loaded with the flattened linesets through
    ``Similar.combine_mapreduce_data`` and then closed. Nothing is returned.
    """
    checker = SimilarChecker(linter)
    checker.open()
    # Call the Similar implementation explicitly, mirroring get_map_data().
    Similar.combine_mapreduce_data(checker, data)
    checker.close()


def register(linter):
"""required method to auto register this checker """
Expand Down
63 changes: 63 additions & 0 deletions tests/input/similar_lines_a.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
""" A file designed to have lines of similarity when compared to similar_lines_b
We use lorem-ipsum to generate 'random' code. """
# Copyright (c) 2020 Frank Harrison <[email protected]>


def adipiscing(elit):
etiam = "id"
dictum = "purus,"
vitae = "pretium"
neque = "Vivamus"
nec = "ornare"
tortor = "sit"
return etiam, dictum, vitae, neque, nec, tortor


class Amet:
def similar_function_3_lines(self, tellus): # line same #1
agittis = 10 # line same #2
tellus *= 300 # line same #3
return agittis, tellus # line diff

def lorem(self, ipsum):
dolor = "sit"
amet = "consectetur"
return (lorem, dolor, amet)

def similar_function_5_lines(self, similar): # line same #1
some_var = 10 # line same #2
someother_var *= 300 # line same #3
fusce = "sit" # line same #4
amet = "tortor" # line same #5
return some_var, someother_var, fusce, amet # line diff

def __init__(self, moleskie, lectus="Mauris", ac="pellentesque"):
metus = "ut"
lobortis = "urna."
Integer = "nisl"
(mauris,) = "interdum"
non = "odio"
semper = "aliquam"
malesuada = "nunc."
iaculis = "dolor"
facilisis = "ultrices"
vitae = "ut."

return (
metus,
lobortis,
Integer,
mauris,
non,
semper,
malesuada,
iaculis,
facilisis,
vitae,
)

def similar_function_3_lines(self, tellus): # line same #1
agittis = 10 # line same #2
tellus *= 300 # line same #3
return agittis, tellus # line diff
36 changes: 36 additions & 0 deletions tests/input/similar_lines_b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
""" The sister file of similar_lines_a, another file designed to have lines of
similarity when compared to its sister file
As with the sister file, we use lorem-ipsum to generate 'random' code. """
# Copyright (c) 2020 Frank Harrison <[email protected]>


class Nulla:
tortor = "ultrices quis porta in"
sagittis = "ut tellus"

def pulvinar(self, blandit, metus):
egestas = [mauris for mauris in zip(blandit, metus)]
neque = (egestas, blandit)

def similar_function_5_lines(self, similar): # line same #1
some_var = 10 # line same #2
someother_var *= 300 # line same #3
fusce = "sit" # line same #4
amet = "tortor" # line same #5
iaculis = "dolor" # line diff
return some_var, someother_var, fusce, amet, iaculis, iaculis # line diff


def tortor(self):
ultrices = 2
quis = ultricies * "porta"
return ultricies, quis


class Commodo:
def similar_function_3_lines(self, tellus): # line same #1
agittis = 10 # line same #2
tellus *= 300 # line same #3
laoreet = "commodo " # line diff
return agittis, tellus, laoreet # line diff
140 changes: 140 additions & 0 deletions tests/unittest_checker_similar.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import pytest

from pylint.checkers import similar
from pylint.lint import PyLinter
from pylint.testutils import TestReporter as Reporter

SIMILAR1 = join(dirname(abspath(__file__)), "input", "similar1")
SIMILAR2 = join(dirname(abspath(__file__)), "input", "similar2")
Expand Down Expand Up @@ -195,3 +197,141 @@ def test_no_args():
assert ex.code == 1
else:
pytest.fail("not system exit")


def test_get_map_data():
    """Check the map data a SimilarChecker hands out for map/reduce.

    Each input file is fed to its own SimilarChecker (a hand-rolled 'map'
    step); the linesets returned by get_map_data() are collected and their
    names and stripped lines are compared with the expected values below.
    """
    linter = PyLinter(reporter=Reporter())

    # Register a similarity checker on the linter, as a parallel run would.
    linter.register_checker(similar.SimilarChecker(linter))

    input_dir = join(dirname(abspath(__file__)), "input")
    source_streams = (
        join(input_dir, "similar_lines_a.py"),
        join(input_dir, "similar_lines_b.py"),
    )
    # One tuple of stripped lines per input file, in source_streams order.
    expected_linelists = (
        (
            "",
            "",
            "",
            "",
            "",
            "",
            "def adipiscing(elit):",
            'etiam = "id"',
            'dictum = "purus,"',
            'vitae = "pretium"',
            'neque = "Vivamus"',
            'nec = "ornare"',
            'tortor = "sit"',
            "return etiam, dictum, vitae, neque, nec, tortor",
            "",
            "",
            "class Amet:",
            "def similar_function_3_lines(self, tellus):",
            "agittis = 10",
            "tellus *= 300",
            "return agittis, tellus",
            "",
            "def lorem(self, ipsum):",
            'dolor = "sit"',
            'amet = "consectetur"',
            "return (lorem, dolor, amet)",
            "",
            "def similar_function_5_lines(self, similar):",
            "some_var = 10",
            "someother_var *= 300",
            'fusce = "sit"',
            'amet = "tortor"',
            "return some_var, someother_var, fusce, amet",
            "",
            'def __init__(self, moleskie, lectus="Mauris", ac="pellentesque"):',
            'metus = "ut"',
            'lobortis = "urna."',
            'Integer = "nisl"',
            '(mauris,) = "interdum"',
            'non = "odio"',
            'semper = "aliquam"',
            'malesuada = "nunc."',
            'iaculis = "dolor"',
            'facilisis = "ultrices"',
            'vitae = "ut."',
            "",
            "return (",
            "metus,",
            "lobortis,",
            "Integer,",
            "mauris,",
            "non,",
            "semper,",
            "malesuada,",
            "iaculis,",
            "facilisis,",
            "vitae,",
            ")",
            "",
            "def similar_function_3_lines(self, tellus):",
            "agittis = 10",
            "tellus *= 300",
            "return agittis, tellus",
        ),
        (
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "class Nulla:",
            'tortor = "ultrices quis porta in"',
            'sagittis = "ut tellus"',
            "",
            "def pulvinar(self, blandit, metus):",
            "egestas = [mauris for mauris in zip(blandit, metus)]",
            "neque = (egestas, blandit)",
            "",
            "def similar_function_5_lines(self, similar):",
            "some_var = 10",
            "someother_var *= 300",
            'fusce = "sit"',
            'amet = "tortor"',
            'iaculis = "dolor"',
            "return some_var, someother_var, fusce, amet, iaculis, iaculis",
            "",
            "",
            "def tortor(self):",
            "ultrices = 2",
            'quis = ultricies * "porta"',
            "return ultricies, quis",
            "",
            "",
            "class Commodo:",
            "def similar_function_3_lines(self, tellus):",
            "agittis = 10",
            "tellus *= 300",
            'laoreet = "commodo "',
            "return agittis, tellus, laoreet",
        ),
    )

    data = []

    # Hand-rolled 'map' step: one checker per source file.
    for source_fname in source_streams:
        sim = similar.SimilarChecker(linter)
        with open(source_fname) as stream:
            sim.append_stream(source_fname, stream)
        # Collect what this checker would contribute to a reduce step.
        data.extend(sim.get_map_data())

    assert len(expected_linelists) == len(data)
    for source_fname, expected_lines, lineset_obj in zip(
        source_streams, expected_linelists, data
    ):
        assert source_fname == lineset_obj.name
        # enumerate_stripped() yields (index, text) pairs; only text matters.
        stripped = tuple(text for _, text in lineset_obj.enumerate_stripped())
        assert tuple(expected_lines) == stripped

0 comments on commit d1bbd4c

Please sign in to comment.