From 317e3a6b4be8296b80f16cd93cd710d05e856972 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B5=D0=B2=20=D0=9A=D0=B8?= =?UTF-8?q?=D1=80=D0=B8=D0=BB=D0=BB?= Date: Tue, 9 Nov 2021 16:12:26 +0400 Subject: [PATCH 1/7] Convert empty instances to `derive anyclass` --- package.yaml | 1 + src/Xrefcheck/Core.hs | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/package.yaml b/package.yaml index f2b8dee7..ac620549 100644 --- a/package.yaml +++ b/package.yaml @@ -25,6 +25,7 @@ default-extensions: - ConstraintKinds - DataKinds - DefaultSignatures + - DeriveAnyClass - DeriveDataTypeable - DeriveGeneric - DerivingStrategies diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index 0ef754d0..c9f68917 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -62,6 +62,7 @@ instance FromJSON Flavor where -- representation of this thing, and it actually appears in reports only. newtype Position = Position (Maybe Text) deriving stock (Show, Eq, Generic) + deriving anyclass NFData instance Buildable Position where build (Position pos) = case pos of @@ -77,7 +78,9 @@ data Reference = Reference , rAnchor :: Maybe Text -- ^ Section or custom anchor tag. , rPos :: Position - } deriving stock (Show, Generic) + } + deriving stock (Show, Generic) + deriving anyclass NFData -- | Context of anchor. data AnchorType @@ -88,13 +91,16 @@ data AnchorType | BiblioAnchor -- ^ Id of entry in bibliography deriving stock (Show, Eq, Generic) + deriving anyclass NFData -- | A referable anchor. data Anchor = Anchor { aType :: AnchorType , aName :: Text , aPos :: Position - } deriving stock (Show, Eq, Generic) + } + deriving stock (Show, Eq, Generic) + deriving anyclass NFData data FileInfoDiff = FileInfoDiff { _fidReferences :: DList Reference @@ -116,7 +122,9 @@ instance Monoid FileInfoDiff where data FileInfo = FileInfo { _fiReferences :: [Reference] , _fiAnchors :: [Anchor] - } deriving stock (Show, Generic) + } + deriving stock (Show, Generic) + deriving anyclass NFData makeLenses ''FileInfo instance Default FileInfo where @@ -129,12 +137,6 @@ newtype RepoInfo = RepoInfo (Map FilePath FileInfo) -- Instances ----------------------------------------------------------- -instance NFData Position -instance NFData Reference -instance NFData AnchorType -instance NFData Anchor -instance NFData FileInfo - instance Buildable Reference where build Reference{..} = nameF ("reference " +| paren (build loc) |+ " " +| rPos |+ "") $ From e16377be8d740b1a41e1fc10daf94bc46b02f935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B5=D0=B2=20=D0=9A=D0=B8?= =?UTF-8?q?=D1=80=D0=B8=D0=BB=D0=BB?= Date: Tue, 9 Nov 2021 16:14:51 +0400 Subject: [PATCH 2/7] Move imports of DList in Xrefcheck.Core --- src/Xrefcheck/Core.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index c9f68917..a831a8e6 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -15,6 +15,8 @@ import Control.Lens (makeLenses) import Data.Aeson (FromJSON (..), withText) import Data.Char (isAlphaNum) import Data.Char qualified as C +import Data.DList (DList) +import Data.DList qualified as DList import Data.Default (Default (..)) import Data.List qualified as L import Data.Map qualified as M @@ -26,8 +28,6 @@ import Text.Numeral.Roman (toRoman) import Xrefcheck.Progress import Xrefcheck.Util -import Data.DList (DList) -import Data.DList qualified as DList ----------------------------------------------------------- -- Types From 681c6b1d290661c33251fc2aa8cff0b3aa1ac31b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B5=D0=B2=20=D0=9A=D0=B8?= =?UTF-8?q?=D1=80=D0=B8=D0=BB=D0=BB?= Date: Tue, 9 Nov 2021 16:18:49 +0400 Subject: [PATCH 3/7] Add a type to store bad copy-paste info --- src/Xrefcheck/Core.hs | 18 ++++++++++++++---- src/Xrefcheck/Scanners/Markdown.hs | 16 +++++++++------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index a831a8e6..4c2428b3 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -102,26 +102,36 @@ data Anchor = Anchor deriving stock (Show, Eq, Generic) deriving anyclass NFData +data CopyPaste = CopyPaste + { cpAnchorText :: Text + , cpPlainText :: Text + , cpPosition :: Position + } + deriving stock (Show, Eq, Generic) + deriving anyclass NFData + data FileInfoDiff = FileInfoDiff { _fidReferences :: DList Reference , _fidAnchors :: DList Anchor + , _fidCopyPastes :: DList CopyPaste } makeLenses ''FileInfoDiff diffToFileInfo :: FileInfoDiff -> FileInfo -diffToFileInfo (FileInfoDiff refs anchors) = - FileInfo (DList.toList refs) (DList.toList anchors) +diffToFileInfo (FileInfoDiff refs anchors pastas) = + FileInfo (DList.toList refs) (DList.toList anchors) (DList.toList pastas) instance Semigroup FileInfoDiff where - FileInfoDiff a b <> FileInfoDiff c d = FileInfoDiff (a <> c) (b <> d) + FileInfoDiff a b e <> FileInfoDiff c d f = FileInfoDiff (a <> c) (b <> d) (e <> f) instance Monoid FileInfoDiff where - mempty = FileInfoDiff mempty mempty + mempty = FileInfoDiff mempty mempty mempty -- | All information regarding a single file we care about. data FileInfo = FileInfo { _fiReferences :: [Reference] , _fiAnchors :: [Anchor] + , _fiCopyPastes :: [CopyPaste] } deriving stock (Show, Generic) deriving anyclass NFData diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index 9dbad32f..01458ed8 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -144,7 +144,9 @@ nodeExtractInfo input@(Node _ _ nSubs) = do let aType = HeaderAnchor lvl let aName = headerToAnchor flavor $ nodeExtractText node let aPos = toPosition pos - return $ FileInfoDiff DList.empty $ DList.singleton $ Anchor {aType, aName, aPos} + return mempty + { _fidAnchors = DList.singleton $ Anchor {aType, aName, aPos} + } HTML_INLINE text -> do let mName = T.stripSuffix "\">" =<< T.stripPrefix " do let aType = HandAnchor aPos = toPosition pos - return $ FileInfoDiff - mempty - (pure $ Anchor {aType, aName, aPos}) + return mempty + { _fidAnchors = DList.singleton $ Anchor {aType, aName, aPos} + } Nothing -> do return mempty @@ -167,9 +169,9 @@ nodeExtractInfo input@(Node _ _ nSubs) = do [t] -> (t, Nothing) t : ts -> (t, Just $ T.intercalate "#" ts) [] -> error "impossible" - return $ FileInfoDiff - (DList.singleton $ Reference {rName, rPos, rLink, rAnchor}) - DList.empty + return mempty + { _fidReferences = DList.singleton $ Reference {rName, rPos, rLink, rAnchor} + } _ -> return mempty From dfda710ce2c556f0777f3ebb57d2b097e6bd21b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B5=D0=B2=20=D0=9A=D0=B8?= =?UTF-8?q?=D1=80=D0=B8=D0=BB=D0=BB?= Date: Tue, 9 Nov 2021 16:21:03 +0400 Subject: [PATCH 4/7] Remove manual enumeration of flavours --- src/Xrefcheck/Core.hs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index 4c2428b3..80ffd57a 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -40,15 +40,10 @@ import Xrefcheck.Util data Flavor = GitHub | GitLab - deriving stock (Show) + deriving stock (Show, Enum, Bounded) allFlavors :: [Flavor] -allFlavors = [GitHub, GitLab] - where - _exhaustivenessCheck = \case - GitHub -> () - GitLab -> () - -- if you update this, also update the list above +allFlavors = [minBound.. maxBound] instance FromJSON Flavor where parseJSON = withText "flavor" $ \txt -> From f60aeceb1678b9f24c78067d07705e525f67bd4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B5=D0=B2=20=D0=9A=D0=B8?= =?UTF-8?q?=D1=80=D0=B8=D0=BB=D0=BB?= Date: Tue, 9 Nov 2021 16:26:39 +0400 Subject: [PATCH 5/7] Add a plug for copy-paste detector --- src/Xrefcheck/Scanners/Markdown.hs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index 01458ed8..92942a06 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -130,7 +130,7 @@ nodeExtractInfo input@(Node _ _ nSubs) = do else case removeIgnored input of Left err -> throwError err Right relevant -> - diffToFileInfo <$> foldNode extractor relevant + diffToFileInfo <$> foldNode (merge [extractor, copyPaste]) relevant where extractor :: Node -> m FileInfoDiff @@ -175,6 +175,12 @@ nodeExtractInfo input@(Node _ _ nSubs) = do _ -> return mempty + copyPaste :: Node -> m FileInfoDiff + copyPaste _ = pure mempty + +merge :: (Monad m, Monoid b) => [a -> m b] -> a -> m b +merge fs a = mconcat <$> traverse ($ a) fs + checkIgnoreFile :: [Node] -> Bool checkIgnoreFile nodes = let isSimpleComment :: Node -> Bool From 94748c51d4d9b80318f18a0c45843d7b0f54de1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B5=D0=B2=20=D0=9A=D0=B8?= =?UTF-8?q?=D1=80=D0=B8=D0=BB=D0=BB?= Date: Tue, 9 Nov 2021 19:49:27 +0400 Subject: [PATCH 6/7] Add copy-paste scanner and tests --- src/Xrefcheck/Core.hs | 1 + src/Xrefcheck/Scanners/Markdown.hs | 30 +++++++++++++++++++ src/Xrefcheck/Verify.hs | 25 ++++++++++++---- tests/Test/Xrefcheck/CopyPasteInListsSpec.hs | 24 +++++++++++++++ .../copy-paste_in_lists.md | 17 +++++++++++ 5 files changed, 91 insertions(+), 6 deletions(-) create mode 100644 tests/Test/Xrefcheck/CopyPasteInListsSpec.hs create mode 100644 tests/markdowns/without-annotations/copy-paste_in_lists.md diff --git a/src/Xrefcheck/Core.hs b/src/Xrefcheck/Core.hs index 80ffd57a..ea9491a8 100644 --- a/src/Xrefcheck/Core.hs +++ b/src/Xrefcheck/Core.hs @@ -130,6 +130,7 @@ data FileInfo = FileInfo } deriving stock (Show, Generic) deriving anyclass NFData + makeLenses ''FileInfo instance Default FileInfo where diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index 92942a06..0db2cf3d 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -18,11 +18,13 @@ module Xrefcheck.Scanners.Markdown import Universum import CMarkGFM (Node (..), NodeType (..), PosInfo (..), commonmarkToNode) +import Control.Lens hiding ((^?)) import Control.Monad.Except (MonadError, throwError) import Data.Aeson.TH (deriveFromJSON) import Data.ByteString.Lazy qualified as BSL import Data.DList qualified as DList import Data.Default (def) +import Data.List (isSubsequenceOf) import Data.Text qualified as T import Data.Text.Lazy qualified as LT import Fmt (Buildable (..), blockListF, nameF, (+|), (|+)) @@ -37,6 +39,8 @@ data MarkdownConfig = MarkdownConfig deriveFromJSON aesonConfigOption ''MarkdownConfig +makePrisms ''NodeType + defGithubMdConfig :: MarkdownConfig defGithubMdConfig = MarkdownConfig { mcFlavor = GitHub @@ -176,8 +180,34 @@ nodeExtractInfo input@(Node _ _ nSubs) = do _ -> return mempty copyPaste :: Node -> m FileInfoDiff + copyPaste (Node _ (LIST _) nodes) = do + case items of + top : rest | urlIsASubsequence top -> do + let bad = filter (not . urlIsASubsequence) rest + pure mempty { _fidCopyPastes = DList.fromList bad } + _ -> do + pure mempty + where + items = do + (_, nodes', _) <- takeOnly _ITEM nodes + (_, nodes'', _) <- takeOnly _PARAGRAPH nodes' + take 1 $ do + (_, texts, (url, _)) <- takeOnly _LINK nodes'' + (pos, _, txt) <- take 1 $ takeOnly _TEXT texts + return (CopyPaste url txt (toPosition pos)) + copyPaste _ = pure mempty + takeOnly prizm list = do + Node pos hdr nodes <- list + case hdr^?prizm of + Just res -> return (pos, nodes, res) + Nothing -> [] + +urlIsASubsequence :: CopyPaste -> Bool +urlIsASubsequence paste = + T.unpack (cpAnchorText paste) `isSubsequenceOf` T.unpack (cpPlainText paste) + merge :: (Monad m, Monoid b) => [a -> m b] -> a -> m b merge fs a = mconcat <$> traverse ($ a) fs diff --git a/src/Xrefcheck/Verify.hs b/src/Xrefcheck/Verify.hs index f3be2154..92606550 100644 --- a/src/Xrefcheck/Verify.hs +++ b/src/Xrefcheck/Verify.hs @@ -27,6 +27,7 @@ import Universum import Control.Concurrent.Async (wait, withAsync) import Control.Exception (throwIO) import Control.Monad.Except (MonadError (..)) +import Data.Bits (toIntegralSized) import Data.ByteString qualified as BS import Data.Map qualified as M import Data.Text qualified as T @@ -49,7 +50,6 @@ import Text.Regex.TDFA.Text (Regex, regexec) import Text.URI (Authority (..), URI (..), mkURI) import Time (RatioNat, Second, Time (..), ms, threadDelay, timeout) -import Data.Bits (toIntegralSized) import Xrefcheck.Config import Xrefcheck.Core import Xrefcheck.Orphans () @@ -70,9 +70,7 @@ deriving newtype instance Semigroup (VerifyResult e) deriving newtype instance Monoid (VerifyResult e) instance Buildable e => Buildable (VerifyResult e) where - build vr = case verifyErrors vr of - Nothing -> "ok" - Just errs -> listF errs + build vr = maybe "ok" listF (verifyErrors vr) verifyOk :: VerifyResult e -> Bool verifyOk (VerifyResult errors) = null errors @@ -114,6 +112,7 @@ data VerifyError | ExternalFtpException FTPException | FtpEntryDoesNotExist FilePath | ExternalResourceSomeError Text + | PossiblyIncorrectCopyPaste Text Text deriving stock (Show, Eq) instance Buildable VerifyError where @@ -156,10 +155,15 @@ instance Buildable VerifyError where "⛂ FTP exception (" +| err |+ ")\n" FtpEntryDoesNotExist entry -> - "⛂ File or directory does not exist:\n" +| entry |+ "\n" + "⛂ File or directory does not exist:\n" +| entry |+ "\n" ExternalResourceSomeError err -> "⛂ " +| build err |+ "\n\n" + + PossiblyIncorrectCopyPaste url text -> + "⛂ Possibly incorrect copy-paste in list with references\n" +| + " the url is " +| build url |+ "\n " +| + " but the text is " +| build text |+ "\n\n" where anchorHints = \case [] -> "\n" @@ -219,10 +223,19 @@ verifyRepo progressRef <- newIORef $ initVerifyProgress (map snd toScan) + errorss <- for (M.toList repoInfo) $ \(file, info) -> do + let pasta = _fiCopyPastes info + return + $ VerifyResult + $ fmap (\(CopyPaste url txt pos) -> + WithReferenceLoc file (Reference "" "" Nothing pos) + $ PossiblyIncorrectCopyPaste url txt) + pasta + accumulated <- withAsync (printer progressRef) $ \_ -> forConcurrentlyCaching toScan ifExternalThenCache $ \(file, ref) -> verifyReference config mode progressRef repoInfo' root file ref - return $ fold accumulated + return $ fold errorss <> fold accumulated where printer progressRef = forever $ do readIORef progressRef >>= reprintAnalyseProgress rw mode diff --git a/tests/Test/Xrefcheck/CopyPasteInListsSpec.hs b/tests/Test/Xrefcheck/CopyPasteInListsSpec.hs new file mode 100644 index 00000000..dac6b9d4 --- /dev/null +++ b/tests/Test/Xrefcheck/CopyPasteInListsSpec.hs @@ -0,0 +1,24 @@ +{- SPDX-FileCopyrightText: 2019 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +module Test.Xrefcheck.CopyPasteInListsSpec where + +import Universum + +import Test.Hspec (Spec, describe, it, shouldBe) + +import Test.Xrefcheck.Util +import Xrefcheck.Core + +spec :: Spec +spec = do + describe "Possibly incorrect copy-paste" $ do + for_ allFlavors $ \fl -> do + it ("is detected (" <> show fl <> ")") $ do + fi <- getFI fl "tests/markdowns/without-annotations/copy-paste_in_lists.md" + getPasta fi `shouldBe`[("a", "c")] + where + getPasta :: FileInfo -> [(Text, Text)] + getPasta fi = map (cpAnchorText &&& cpPlainText) $ fi ^. fiCopyPastes diff --git a/tests/markdowns/without-annotations/copy-paste_in_lists.md b/tests/markdowns/without-annotations/copy-paste_in_lists.md new file mode 100644 index 00000000..44b42b17 --- /dev/null +++ b/tests/markdowns/without-annotations/copy-paste_in_lists.md @@ -0,0 +1,17 @@ + + +A list with bad copy-paste: + +- [a](a) e +- [b](b) e +- [c](a) e + +A list that is completely fine: + +- [a](a) d +- [b](b) d +- [c](c) d From 034bef52d70a80635337764280d0ba0cf711cd66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D0=BD=D0=B4=D1=80=D0=B5=D0=B5=D0=B2=20=D0=9A=D0=B8?= =?UTF-8?q?=D1=80=D0=B8=D0=BB=D0=BB?= Date: Tue, 9 Nov 2021 20:30:15 +0400 Subject: [PATCH 7/7] Restore gist extraction --- src/Xrefcheck/Scanners/Markdown.hs | 5 ++++- tests/Test/Xrefcheck/CopyPasteInListsSpec.hs | 2 +- .../without-annotations/copy-paste_in_lists.md | 12 ++++++------ 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/Xrefcheck/Scanners/Markdown.hs b/src/Xrefcheck/Scanners/Markdown.hs index 0db2cf3d..711c3b24 100644 --- a/src/Xrefcheck/Scanners/Markdown.hs +++ b/src/Xrefcheck/Scanners/Markdown.hs @@ -22,6 +22,7 @@ import Control.Lens hiding ((^?)) import Control.Monad.Except (MonadError, throwError) import Data.Aeson.TH (deriveFromJSON) import Data.ByteString.Lazy qualified as BSL +import Data.Char (isAlpha) import Data.DList qualified as DList import Data.Default (def) import Data.List (isSubsequenceOf) @@ -206,7 +207,9 @@ nodeExtractInfo input@(Node _ _ nSubs) = do urlIsASubsequence :: CopyPaste -> Bool urlIsASubsequence paste = - T.unpack (cpAnchorText paste) `isSubsequenceOf` T.unpack (cpPlainText paste) + gist (cpAnchorText paste) `isSubsequenceOf` gist (cpPlainText paste) + where + gist = T.unpack . T.toLower . T.filter isAlpha merge :: (Monad m, Monoid b) => [a -> m b] -> a -> m b merge fs a = mconcat <$> traverse ($ a) fs diff --git a/tests/Test/Xrefcheck/CopyPasteInListsSpec.hs b/tests/Test/Xrefcheck/CopyPasteInListsSpec.hs index dac6b9d4..08a0a3ab 100644 --- a/tests/Test/Xrefcheck/CopyPasteInListsSpec.hs +++ b/tests/Test/Xrefcheck/CopyPasteInListsSpec.hs @@ -18,7 +18,7 @@ spec = do for_ allFlavors $ \fl -> do it ("is detected (" <> show fl <> ")") $ do fi <- getFI fl "tests/markdowns/without-annotations/copy-paste_in_lists.md" - getPasta fi `shouldBe`[("a", "c")] + getPasta fi `shouldBe`[("foo-bar","Foo Kek")] where getPasta :: FileInfo -> [(Text, Text)] getPasta fi = map (cpAnchorText &&& cpPlainText) $ fi ^. fiCopyPastes diff --git a/tests/markdowns/without-annotations/copy-paste_in_lists.md b/tests/markdowns/without-annotations/copy-paste_in_lists.md index 44b42b17..aa88875c 100644 --- a/tests/markdowns/without-annotations/copy-paste_in_lists.md +++ b/tests/markdowns/without-annotations/copy-paste_in_lists.md @@ -6,12 +6,12 @@ A list with bad copy-paste: -- [a](a) e -- [b](b) e -- [c](a) e +- [Foo Bar](foo-bar) e +- [Foo Qux](foo-qux) e +- [Foo Kek](foo-bar) e A list that is completely fine: -- [a](a) d -- [b](b) d -- [c](c) d +- [Foo Bar](foo-bar) e +- [Foo Qux](foo-qux) e +- [Foo Kek](foo-kek) e