-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[#64] Copy paste detection in lists #102
base: master
Are you sure you want to change the base?
Changes from all commits
317e3a6
e16377b
681c6b1
dfda710
f60aece
94748c5
034bef5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,6 +15,8 @@ import Control.Lens (makeLenses) | |
import Data.Aeson (FromJSON (..), withText) | ||
import Data.Char (isAlphaNum) | ||
import Data.Char qualified as C | ||
import Data.DList (DList) | ||
import Data.DList qualified as DList | ||
import Data.Default (Default (..)) | ||
import Data.List qualified as L | ||
import Data.Map qualified as M | ||
|
@@ -26,8 +28,6 @@ import Text.Numeral.Roman (toRoman) | |
|
||
import Xrefcheck.Progress | ||
import Xrefcheck.Util | ||
import Data.DList (DList) | ||
import Data.DList qualified as DList | ||
|
||
----------------------------------------------------------- | ||
-- Types | ||
|
@@ -40,15 +40,10 @@ import Data.DList qualified as DList | |
data Flavor | ||
= GitHub | ||
| GitLab | ||
deriving stock (Show) | ||
deriving stock (Show, Enum, Bounded) | ||
|
||
allFlavors :: [Flavor] | ||
allFlavors = [GitHub, GitLab] | ||
where | ||
_exhaustivenessCheck = \case | ||
GitHub -> () | ||
GitLab -> () | ||
-- if you update this, also update the list above | ||
allFlavors = [minBound.. maxBound] | ||
|
||
instance FromJSON Flavor where | ||
parseJSON = withText "flavor" $ \txt -> | ||
|
@@ -62,6 +57,7 @@ instance FromJSON Flavor where | |
-- representation of this thing, and it actually appears in reports only. | ||
newtype Position = Position (Maybe Text) | ||
deriving stock (Show, Eq, Generic) | ||
deriving anyclass NFData | ||
|
||
instance Buildable Position where | ||
build (Position pos) = case pos of | ||
|
@@ -77,7 +73,9 @@ data Reference = Reference | |
, rAnchor :: Maybe Text | ||
-- ^ Section or custom anchor tag. | ||
, rPos :: Position | ||
} deriving stock (Show, Generic) | ||
} | ||
deriving stock (Show, Generic) | ||
deriving anyclass NFData | ||
|
||
-- | Context of anchor. | ||
data AnchorType | ||
|
@@ -88,35 +86,51 @@ data AnchorType | |
| BiblioAnchor | ||
-- ^ Id of entry in bibliography | ||
deriving stock (Show, Eq, Generic) | ||
deriving anyclass NFData | ||
|
||
-- | A referable anchor. | ||
data Anchor = Anchor | ||
{ aType :: AnchorType | ||
, aName :: Text | ||
, aPos :: Position | ||
} deriving stock (Show, Eq, Generic) | ||
} | ||
deriving stock (Show, Eq, Generic) | ||
deriving anyclass NFData | ||
|
||
data CopyPaste = CopyPaste | ||
{ cpAnchorText :: Text | ||
, cpPlainText :: Text | ||
, cpPosition :: Position | ||
} | ||
deriving stock (Show, Eq, Generic) | ||
deriving anyclass NFData | ||
|
||
data FileInfoDiff = FileInfoDiff | ||
{ _fidReferences :: DList Reference | ||
, _fidAnchors :: DList Anchor | ||
, _fidCopyPastes :: DList CopyPaste | ||
} | ||
makeLenses ''FileInfoDiff | ||
|
||
diffToFileInfo :: FileInfoDiff -> FileInfo | ||
diffToFileInfo (FileInfoDiff refs anchors) = | ||
FileInfo (DList.toList refs) (DList.toList anchors) | ||
diffToFileInfo (FileInfoDiff refs anchors pastas) = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: I believe here should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🍝 |
||
FileInfo (DList.toList refs) (DList.toList anchors) (DList.toList pastas) | ||
|
||
instance Semigroup FileInfoDiff where | ||
FileInfoDiff a b <> FileInfoDiff c d = FileInfoDiff (a <> c) (b <> d) | ||
FileInfoDiff a b e <> FileInfoDiff c d f = FileInfoDiff (a <> c) (b <> d) (e <> f) | ||
|
||
instance Monoid FileInfoDiff where | ||
mempty = FileInfoDiff mempty mempty | ||
mempty = FileInfoDiff mempty mempty mempty | ||
|
||
-- | All information regarding a single file we care about. | ||
data FileInfo = FileInfo | ||
{ _fiReferences :: [Reference] | ||
, _fiAnchors :: [Anchor] | ||
} deriving stock (Show, Generic) | ||
, _fiCopyPastes :: [CopyPaste] | ||
} | ||
deriving stock (Show, Generic) | ||
deriving anyclass NFData | ||
|
||
makeLenses ''FileInfo | ||
|
||
instance Default FileInfo where | ||
|
@@ -129,12 +143,6 @@ newtype RepoInfo = RepoInfo (Map FilePath FileInfo) | |
-- Instances | ||
----------------------------------------------------------- | ||
|
||
instance NFData Position | ||
instance NFData Reference | ||
instance NFData AnchorType | ||
instance NFData Anchor | ||
instance NFData FileInfo | ||
|
||
instance Buildable Reference where | ||
build Reference{..} = | ||
nameF ("reference " +| paren (build loc) |+ " " +| rPos |+ "") $ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,11 +18,14 @@ module Xrefcheck.Scanners.Markdown | |
import Universum | ||
|
||
import CMarkGFM (Node (..), NodeType (..), PosInfo (..), commonmarkToNode) | ||
import Control.Lens hiding ((^?)) | ||
import Control.Monad.Except (MonadError, throwError) | ||
import Data.Aeson.TH (deriveFromJSON) | ||
import Data.ByteString.Lazy qualified as BSL | ||
import Data.Char (isAlpha) | ||
import Data.DList qualified as DList | ||
import Data.Default (def) | ||
import Data.List (isSubsequenceOf) | ||
import Data.Text qualified as T | ||
import Data.Text.Lazy qualified as LT | ||
import Fmt (Buildable (..), blockListF, nameF, (+|), (|+)) | ||
|
@@ -37,6 +40,8 @@ data MarkdownConfig = MarkdownConfig | |
|
||
deriveFromJSON aesonConfigOption ''MarkdownConfig | ||
|
||
makePrisms ''NodeType | ||
|
||
defGithubMdConfig :: MarkdownConfig | ||
defGithubMdConfig = MarkdownConfig | ||
{ mcFlavor = GitHub | ||
|
@@ -130,7 +135,7 @@ nodeExtractInfo input@(Node _ _ nSubs) = do | |
else case removeIgnored input of | ||
Left err -> throwError err | ||
Right relevant -> | ||
diffToFileInfo <$> foldNode extractor relevant | ||
diffToFileInfo <$> foldNode (merge [extractor, copyPaste]) relevant | ||
|
||
where | ||
extractor :: Node -> m FileInfoDiff | ||
|
@@ -144,17 +149,19 @@ nodeExtractInfo input@(Node _ _ nSubs) = do | |
let aType = HeaderAnchor lvl | ||
let aName = headerToAnchor flavor $ nodeExtractText node | ||
let aPos = toPosition pos | ||
return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos} | ||
return mempty | ||
{ _fidAnchors = DList.singleton $ Anchor {aType, aName, aPos} | ||
} | ||
|
||
HTML_INLINE text -> do | ||
let mName = T.stripSuffix "\">" =<< T.stripPrefix "<a name=\"" text | ||
case mName of | ||
Just aName -> do | ||
let aType = HandAnchor | ||
aPos = toPosition pos | ||
return $ FileInfoDiff | ||
mempty | ||
(pure $ Anchor {aType, aName, aPos}) | ||
return mempty | ||
{ _fidAnchors = DList.singleton $ Anchor {aType, aName, aPos} | ||
} | ||
|
||
Nothing -> do | ||
return mempty | ||
|
@@ -167,12 +174,46 @@ nodeExtractInfo input@(Node _ _ nSubs) = do | |
[t] -> (t, Nothing) | ||
t : ts -> (t, Just $ T.intercalate "#" ts) | ||
[] -> error "impossible" | ||
return $ FileInfoDiff | ||
(DList.singleton $ Reference {rName, rPos, rLink, rAnchor}) | ||
DList.empty | ||
return mempty | ||
{ _fidReferences = DList.singleton $ Reference {rName, rPos, rLink, rAnchor} | ||
} | ||
|
||
_ -> return mempty | ||
|
||
copyPaste :: Node -> m FileInfoDiff | ||
copyPaste (Node _ (LIST _) nodes) = do | ||
case items of | ||
top : rest | urlIsASubsequence top -> do | ||
let bad = filter (not . urlIsASubsequence) rest | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the acceptance criteria, we want to report copy-paste if However, AFAIU here you check that the first item satisfy the law --
produces
I am not sure if we need to consider such list items as bad copy-paste, I believe it would be better to don't take them into account at all and report only those which strictly satisfy the law from acceptance criteria -- |
||
pure mempty { _fidCopyPastes = DList.fromList bad } | ||
_ -> do | ||
pure mempty | ||
where | ||
items = do | ||
(_, nodes', _) <- takeOnly _ITEM nodes | ||
(_, nodes'', _) <- takeOnly _PARAGRAPH nodes' | ||
take 1 $ do | ||
(_, texts, (url, _)) <- takeOnly _LINK nodes'' | ||
(pos, _, txt) <- take 1 $ takeOnly _TEXT texts | ||
return (CopyPaste url txt (toPosition pos)) | ||
|
||
copyPaste _ = pure mempty | ||
|
||
takeOnly prizm list = do | ||
Node pos hdr nodes <- list | ||
case hdr^?prizm of | ||
Just res -> return (pos, nodes, res) | ||
Nothing -> [] | ||
|
||
urlIsASubsequence :: CopyPaste -> Bool | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: AFAICS, here we check if link text is a subsequence of link URL, so if name is a URL subsequence, not URL is a subsequence of name |
||
urlIsASubsequence paste = | ||
gist (cpAnchorText paste) `isSubsequenceOf` gist (cpPlainText paste) | ||
where | ||
gist = T.unpack . T.toLower . T.filter isAlpha | ||
|
||
merge :: (Monad m, Monoid b) => [a -> m b] -> a -> m b | ||
merge fs a = mconcat <$> traverse ($ a) fs | ||
|
||
checkIgnoreFile :: [Node] -> Bool | ||
checkIgnoreFile nodes = | ||
let isSimpleComment :: Node -> Bool | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -27,6 +27,7 @@ import Universum | |||||
import Control.Concurrent.Async (wait, withAsync) | ||||||
import Control.Exception (throwIO) | ||||||
import Control.Monad.Except (MonadError (..)) | ||||||
import Data.Bits (toIntegralSized) | ||||||
import Data.ByteString qualified as BS | ||||||
import Data.Map qualified as M | ||||||
import Data.Text qualified as T | ||||||
|
@@ -49,7 +50,6 @@ import Text.Regex.TDFA.Text (Regex, regexec) | |||||
import Text.URI (Authority (..), URI (..), mkURI) | ||||||
import Time (RatioNat, Second, Time (..), ms, threadDelay, timeout) | ||||||
|
||||||
import Data.Bits (toIntegralSized) | ||||||
import Xrefcheck.Config | ||||||
import Xrefcheck.Core | ||||||
import Xrefcheck.Orphans () | ||||||
|
@@ -70,9 +70,7 @@ deriving newtype instance Semigroup (VerifyResult e) | |||||
deriving newtype instance Monoid (VerifyResult e) | ||||||
|
||||||
instance Buildable e => Buildable (VerifyResult e) where | ||||||
build vr = case verifyErrors vr of | ||||||
Nothing -> "ok" | ||||||
Just errs -> listF errs | ||||||
build vr = maybe "ok" listF (verifyErrors vr) | ||||||
|
||||||
verifyOk :: VerifyResult e -> Bool | ||||||
verifyOk (VerifyResult errors) = null errors | ||||||
|
@@ -114,6 +112,7 @@ data VerifyError | |||||
| ExternalFtpException FTPException | ||||||
| FtpEntryDoesNotExist FilePath | ||||||
| ExternalResourceSomeError Text | ||||||
| PossiblyIncorrectCopyPaste Text Text | ||||||
deriving stock (Show, Eq) | ||||||
|
||||||
instance Buildable VerifyError where | ||||||
|
@@ -156,10 +155,15 @@ instance Buildable VerifyError where | |||||
"⛂ FTP exception (" +| err |+ ")\n" | ||||||
|
||||||
FtpEntryDoesNotExist entry -> | ||||||
"⛂ File or directory does not exist:\n" +| entry |+ "\n" | ||||||
"⛂ File or directory does not exist:\n" +| entry |+ "\n" | ||||||
|
||||||
ExternalResourceSomeError err -> | ||||||
"⛂ " +| build err |+ "\n\n" | ||||||
|
||||||
PossiblyIncorrectCopyPaste url text -> | ||||||
"⛂ Possibly incorrect copy-paste in list with references\n" +| | ||||||
" the url is " +| build url |+ "\n " +| | ||||||
" but the text is " +| build text |+ "\n\n" | ||||||
where | ||||||
anchorHints = \case | ||||||
[] -> "\n" | ||||||
|
@@ -219,10 +223,19 @@ verifyRepo | |||||
|
||||||
progressRef <- newIORef $ initVerifyProgress (map snd toScan) | ||||||
|
||||||
errorss <- for (M.toList repoInfo) $ \(file, info) -> do | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here and below
Suggested change
|
||||||
let pasta = _fiCopyPastes info | ||||||
return | ||||||
$ VerifyResult | ||||||
$ fmap (\(CopyPaste url txt pos) -> | ||||||
WithReferenceLoc file (Reference "" "" Nothing pos) | ||||||
$ PossiblyIncorrectCopyPaste url txt) | ||||||
pasta | ||||||
|
||||||
accumulated <- withAsync (printer progressRef) $ \_ -> | ||||||
forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) -> | ||||||
verifyReference config mode progressRef repoInfo' root file ref | ||||||
return $ fold accumulated | ||||||
return $ fold errorss <> fold accumulated | ||||||
where | ||||||
printer progressRef = forever $ do | ||||||
readIORef progressRef >>= reprintAnalyseProgress rw mode | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{- SPDX-FileCopyrightText: 2019 Serokell <https://serokell.io> | ||
- | ||
- SPDX-License-Identifier: MPL-2.0 | ||
-} | ||
|
||
module Test.Xrefcheck.CopyPasteInListsSpec where | ||
|
||
import Universum | ||
|
||
import Test.Hspec (Spec, describe, it, shouldBe) | ||
|
||
import Test.Xrefcheck.Util | ||
import Xrefcheck.Core | ||
|
||
spec :: Spec | ||
spec = do | ||
describe "Possibly incorrect copy-paste" $ do | ||
for_ allFlavors $ \fl -> do | ||
it ("is detected (" <> show fl <> ")") $ do | ||
fi <- getFI fl "tests/markdowns/without-annotations/copy-paste_in_lists.md" | ||
getPasta fi `shouldBe`[("foo-bar","Foo Kek")] | ||
where | ||
getPasta :: FileInfo -> [(Text, Text)] | ||
getPasta fi = map (cpAnchorText &&& cpPlainText) $ fi ^. fiCopyPastes |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
<!-- | ||
- SPDX-FileCopyrightText: 2018-2019 Serokell <https://serokell.io> | ||
- | ||
- SPDX-License-Identifier: MPL-2.0 | ||
--> | ||
|
||
A list with bad copy-paste: | ||
|
||
- [Foo Bar](foo-bar) e | ||
- [Foo Qux](foo-qux) e | ||
- [Foo Kek](foo-bar) e | ||
|
||
A list that is completely fine: | ||
|
||
- [Foo Bar](foo-bar) e | ||
- [Foo Qux](foo-qux) e | ||
- [Foo Kek](foo-kek) e |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like
AnchorText
andPlainText
sound confusing. The issue doesn't focus on anchors, but all type of links, external and internal. Perhaps, more genericLinkText
andURLText
would be better?UPD: Okay, I'm not sure if all the links were intended to check, as there is only example with files in issue description. But it's also not clear that this feature is for files only