-
Notifications
You must be signed in to change notification settings - Fork 5
[#64] Refactor markdown scanner #238
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,14 +19,15 @@ module Xrefcheck.Scanners.Markdown | |
|
|
||
| import Universum | ||
|
|
||
| import CMarkGFM | ||
| (Node (..), NodeType (..), PosInfo (..), commonmarkToNode, extAutolink, optFootnotes) | ||
| import CMarkGFM (NodeType (..), PosInfo (..), commonmarkToNode, extAutolink, optFootnotes) | ||
| import CMarkGFM qualified as C | ||
| import Control.Lens (_Just, makeLenses, makeLensesFor, (.=)) | ||
| import Control.Monad.Trans.Writer.CPS (Writer, runWriter, tell) | ||
| import Data.Aeson (FromJSON (..), genericParseJSON) | ||
| import Data.ByteString.Lazy qualified as BSL | ||
| import Data.DList qualified as DList | ||
| import Data.Default (def) | ||
| import Data.List (span) | ||
| import Data.Text qualified as T | ||
| import Data.Text.Lazy qualified as LT | ||
| import Fmt (Buildable (..), nameF) | ||
|
|
@@ -49,8 +50,8 @@ defGithubMdConfig = MarkdownConfig | |
| { mcFlavor = GitHub | ||
| } | ||
|
|
||
| instance Buildable Node where | ||
| build (Node _mpos ty mSubs) = nameF (show ty) $ | ||
| instance Buildable C.Node where | ||
| build (C.Node _mpos ty mSubs) = nameF (show ty) $ | ||
| maybe "[]" interpolateBlockListF (nonEmpty mSubs) | ||
|
|
||
| toPosition :: Maybe PosInfo -> Position | ||
|
|
@@ -67,16 +68,16 @@ toPosition = Position . \case | |
| |] | ||
|
|
||
| -- | Extract text from the topmost node. | ||
| nodeExtractText :: Node -> Text | ||
| nodeExtractText :: (C.Node) -> Text | ||
| nodeExtractText = T.strip . mconcat . map extractText . nodeFlatten | ||
| where | ||
| extractText = \case | ||
| TEXT t -> t | ||
| CODE t -> t | ||
| _ -> "" | ||
|
|
||
| nodeFlatten :: Node -> [NodeType] | ||
| nodeFlatten (Node _pos ty subs) = ty : concatMap nodeFlatten subs | ||
| nodeFlatten :: (C.Node) -> [NodeType] | ||
| nodeFlatten (C.Node _pos ty subs) = ty : concatMap nodeFlatten subs | ||
|
|
||
|
|
||
| data IgnoreMode | ||
|
|
@@ -140,81 +141,132 @@ initialScannerState = ScannerState | |
| type ScannerM a = StateT ScannerState (Writer [ScanError]) a | ||
|
|
||
| -- | A fold over a `Node`. | ||
| cataNode :: (Maybe PosInfo -> NodeType -> [c] -> c) -> Node -> c | ||
| cataNode f (Node pos ty subs) = f pos ty (cataNode f <$> subs) | ||
| cataNode :: (Maybe PosInfo -> NodeType -> [c] -> c) -> C.Node -> c | ||
| cataNode f (C.Node pos ty subs) = f pos ty (cataNode f <$> subs) | ||
|
|
||
| -- | Sets correct @_ssParentNodeType@ before running scanner on each node | ||
| -- | Sets correct @_ssParentNodeType@ before running scanner on each node. | ||
| cataNodeWithParentNodeInfo | ||
| :: (Maybe PosInfo -> NodeType -> [ScannerM a] -> ScannerM a) | ||
| -> Node | ||
| -> C.Node | ||
| -> ScannerM a | ||
| cataNodeWithParentNodeInfo f node = cataNode f' node | ||
| where | ||
| f' pos ty childScanners = f pos ty $ | ||
| map (ssParentNodeType .= Just ty >>) childScanners | ||
|
|
||
| -- | Find ignore annotations (ignore paragraph and ignore link) | ||
| -- and remove nodes that should be ignored | ||
| removeIgnored :: FilePath -> Node -> Writer [ScanError] Node | ||
| removeIgnored fp = withIgnoreMode . cataNodeWithParentNodeInfo remove | ||
| -- and remove nodes that should be ignored. | ||
| processAnnotations :: FilePath -> C.Node -> Writer [ScanError] C.Node | ||
| processAnnotations fp = withIgnoreMode . cataNodeWithParentNodeInfo process | ||
| where | ||
| remove | ||
| process | ||
| :: Maybe PosInfo | ||
| -> NodeType | ||
| -> [ScannerM Node] | ||
| -> ScannerM Node | ||
| remove pos ty subs = do | ||
| let node = Node pos ty [] | ||
| scan <- use ssIgnore >>= \e -> do | ||
| -> [ScannerM C.Node] | ||
| -> ScannerM C.Node | ||
| process pos ty subs = do | ||
| let node = C.Node pos ty [] | ||
| use ssIgnore >>= \ign -> do | ||
| -- When no `Ignore` state is set check next node for annotation, | ||
| -- if found then set it as new `IgnoreMode` otherwise skip node. | ||
| let mbAnnotation = getAnnotation node | ||
| case mbAnnotation of | ||
| Just ann -> handleAnnotation pos ty ann | ||
| Nothing -> case e of | ||
| Nothing -> Node pos ty <$> sequence subs | ||
| Just (Ignore mode modePos) -> | ||
| case (mode, ty) of | ||
| -- We expect to find a paragraph immediately after the | ||
| -- `ignore paragraph` annotanion. If the paragraph is not | ||
| -- found we should report an error. | ||
| (IMSParagraph, PARAGRAPH) -> (ssIgnore .= Nothing) $> defNode | ||
| (IMSParagraph, x) -> do | ||
| lift . tell . makeError modePos fp . ParagraphErr $ prettyType x | ||
| ssIgnore .= Nothing | ||
| Node pos ty <$> sequence subs | ||
|
|
||
| (IMSLink _, LINK {}) -> do | ||
| ssIgnore .= Nothing | ||
| return defNode | ||
| (IMSLink _, IMAGE {}) -> do | ||
| ssIgnore .= Nothing | ||
| return defNode | ||
| (IMSLink ignoreLinkState, _) -> do | ||
| when (ignoreLinkState == ExpectingLinkInSubnodes) $ | ||
| ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink | ||
| node' <- Node pos ty <$> sequence subs | ||
| when (ignoreLinkState == ExpectingLinkInSubnodes) $ do | ||
| currentIgnore <- use ssIgnore | ||
| case currentIgnore of | ||
| Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do | ||
| lift $ tell $ makeError modePos fp LinkErr | ||
| ssIgnore .= Nothing | ||
| _ -> pass | ||
| return node' | ||
|
|
||
| when (ty == PARAGRAPH) $ use ssIgnore >>= \case | ||
| Nothing -> do | ||
| case ty of | ||
| PARAGRAPH -> handleParagraph ign pos ty subs | ||
| LINK {} -> handleLink ign pos ty subs | ||
| IMAGE {} -> handleLink ign pos ty subs | ||
| _ -> handleOther ign pos ty subs | ||
|
|
||
| handleLink :: | ||
| Maybe Ignore -> | ||
| Maybe PosInfo -> | ||
| NodeType -> | ||
| [ScannerM C.Node] -> | ||
| ScannerM C.Node | ||
| handleLink ign pos ty subs = do | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now all the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, they always correspond to
|
||
| let traverseChildren = C.Node pos ty <$> sequence subs | ||
| -- It can be checked that it's correct for all the cases | ||
| ssIgnore .= Nothing | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is where I'd agree with @aeqz on the maintainability issue. Currently, the comment at line above is correct, but if someday someone adds some Also, AFAIU in the case of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It seems that this code is fully made up of such implicit dependencies, because the main reason is how we use annotations. We put an annotation somewhere in the text, and then the further behavior depends on the next nodes.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In case of |
||
|
|
||
| case ign of | ||
| Nothing -> traverseChildren | ||
| Just (Ignore IMSParagraph modePos) -> do | ||
| reportExpectedParagraphAfterIgnoreAnnotation modePos ty | ||
| traverseChildren | ||
| Just (Ignore (IMSLink _) _) -> do | ||
| pure defNode | ||
|
|
||
| handleParagraph :: | ||
| Maybe Ignore -> | ||
| Maybe PosInfo -> | ||
| NodeType -> | ||
| [ScannerM C.Node] -> | ||
| ScannerM C.Node | ||
| handleParagraph ign pos ty subs = do | ||
| let traverseChildren = C.Node pos ty <$> sequence subs | ||
| node <- case ign of | ||
| Nothing -> traverseChildren | ||
| Just (Ignore IMSParagraph _) -> do | ||
| ssIgnore .= Nothing | ||
| pure defNode | ||
| Just (Ignore (IMSLink ignoreLinkState) modePos) -> | ||
| traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs | ||
|
|
||
| use ssIgnore >>= \case | ||
| Just (Ignore (IMSLink ExpectingLinkInParagraph) pragmaPos) -> | ||
| lift $ tell $ makeError pragmaPos fp LinkErr | ||
| _ -> pass | ||
|
|
||
| return scan | ||
| pure node | ||
|
|
||
| handleOther :: | ||
| Maybe Ignore -> | ||
| Maybe PosInfo -> | ||
| NodeType -> | ||
| [ScannerM C.Node] -> | ||
| ScannerM C.Node | ||
| handleOther ign pos ty subs = do | ||
| let traverseChildren = C.Node pos ty <$> sequence subs | ||
|
|
||
| case ign of | ||
| Nothing -> traverseChildren | ||
| Just (Ignore IMSParagraph modePos) -> do | ||
| reportExpectedParagraphAfterIgnoreAnnotation modePos ty | ||
| ssIgnore .= Nothing | ||
| traverseChildren | ||
| Just (Ignore (IMSLink ignoreLinkState) modePos) -> do | ||
| traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs | ||
|
|
||
| reportExpectedParagraphAfterIgnoreAnnotation :: Maybe PosInfo -> NodeType -> ScannerM () | ||
| reportExpectedParagraphAfterIgnoreAnnotation modePos ty = | ||
| lift . tell . makeError modePos fp . ParagraphErr $ prettyType ty | ||
|
|
||
| traverseNodeWithLinkExpected :: | ||
| IgnoreLinkState -> | ||
| Maybe PosInfo -> | ||
| Maybe PosInfo -> | ||
| NodeType -> | ||
| [ScannerM C.Node] -> | ||
| ScannerM C.Node | ||
| traverseNodeWithLinkExpected ignoreLinkState modePos pos ty subs = do | ||
| when (ignoreLinkState == ExpectingLinkInSubnodes) $ | ||
| ssIgnore . _Just . ignoreMode .= IMSLink ParentExpectsLink | ||
|
||
| node' <- C.Node pos ty <$> sequence subs | ||
| when (ignoreLinkState == ExpectingLinkInSubnodes) $ do | ||
| currentIgnore <- use ssIgnore | ||
| case currentIgnore of | ||
| Just (Ignore {_ignoreMode = IMSLink ParentExpectsLink}) -> do | ||
| lift $ tell $ makeError modePos fp LinkErr | ||
| ssIgnore .= Nothing | ||
| _ -> pass | ||
| return node' | ||
|
|
||
| handleAnnotation | ||
| :: Maybe PosInfo | ||
| -> NodeType | ||
| -> GetAnnotation | ||
| -> ScannerM Node | ||
| -> ScannerM C.Node | ||
| handleAnnotation pos nodeType = \case | ||
| IgnoreAnnotation mode -> do | ||
|
||
| let reportIfThereWasAnnotation :: ScannerM () | ||
|
|
@@ -252,16 +304,16 @@ removeIgnored fp = withIgnoreMode . cataNodeWithParentNodeInfo remove | |
| lift . tell $ makeError correctPos fp $ UnrecognisedErr msg | ||
| pure defNode | ||
| where | ||
| correctPos = getPosition $ Node pos nodeType [] | ||
| correctPos = getPosition $ C.Node pos nodeType [] | ||
|
|
||
| prettyType :: NodeType -> Text | ||
| prettyType ty = | ||
| let mType = safeHead $ words $ show ty | ||
| in fromMaybe "" mType | ||
|
|
||
| withIgnoreMode | ||
| :: ScannerM Node | ||
| -> Writer [ScanError] Node | ||
| :: ScannerM C.Node | ||
| -> Writer [ScanError] C.Node | ||
| withIgnoreMode action = action `runStateT` initialScannerState >>= \case | ||
| -- We expect `Ignore` state to be `Nothing` when we reach EOF, | ||
| -- otherwise that means there was an annotation that didn't match | ||
|
|
@@ -276,8 +328,8 @@ removeIgnored fp = withIgnoreMode . cataNodeWithParentNodeInfo remove | |
| (node, _) -> pure node | ||
|
|
||
| -- | Custom `foldMap` for source tree. | ||
| foldNode :: (Monoid a, Monad m) => (Node -> m a) -> Node -> m a | ||
| foldNode action node@(Node _ _ subs) = do | ||
| foldNode :: (Monoid a, Monad m) => (C.Node -> m a) -> C.Node -> m a | ||
| foldNode action node@(C.Node _ _ subs) = do | ||
| a <- action node | ||
| b <- concatForM subs (foldNode action) | ||
| return (a <> b) | ||
|
|
@@ -287,16 +339,19 @@ type ExtractorM a = ReaderT MarkdownConfig (Writer [ScanError]) a | |
| -- | Extract information from source tree. | ||
| nodeExtractInfo | ||
| :: FilePath | ||
| -> Node | ||
| -> C.Node | ||
| -> ExtractorM FileInfo | ||
| nodeExtractInfo fp input@(Node _ _ nSubs) = do | ||
| if checkIgnoreAllFile nSubs | ||
| nodeExtractInfo fp (C.Node nPos nTy nSubs) = do | ||
| let (ignoreFile, contentNodes) = checkGlobalAnnotations nSubs | ||
| if ignoreFile | ||
| then return def | ||
| else diffToFileInfo <$> (foldNode extractor =<< lift (removeIgnored fp input)) | ||
| else diffToFileInfo <$> | ||
| (lift (processAnnotations fp $ C.Node nPos nTy contentNodes) | ||
| >>= foldNode extractor) | ||
|
|
||
| where | ||
| extractor :: Node -> ExtractorM FileInfoDiff | ||
| extractor node@(Node pos ty _) = | ||
| extractor :: C.Node -> ExtractorM FileInfoDiff | ||
| extractor node@(C.Node pos ty _) = | ||
| case ty of | ||
| HTML_BLOCK _ -> do | ||
| return mempty | ||
|
|
@@ -349,24 +404,31 @@ nodeExtractInfo fp input@(Node _ _ nSubs) = do | |
| (DList.singleton $ Reference {rName, rPos, rLink, rAnchor}) | ||
| DList.empty | ||
|
|
||
| -- | Check if there is `ignore all` at the beginning of the file, | ||
| -- ignoring preceding comments if there are any. | ||
| checkIgnoreAllFile :: [Node] -> Bool | ||
| checkIgnoreAllFile nodes = | ||
| let isSimpleComment :: Node -> Bool | ||
| isSimpleComment node = isComment node && not (isIgnoreFile node) | ||
|
|
||
| mIgnoreFile = safeHead $ dropWhile isSimpleComment nodes | ||
| in maybe False isIgnoreFile mIgnoreFile | ||
| -- | Check for global annotations, ignoring simple comments if there are any. | ||
| checkGlobalAnnotations :: [C.Node] -> (Bool, [C.Node]) | ||
| checkGlobalAnnotations nodes = do | ||
| let (headerNodes, contentsNodes) = span isHeaderNode nodes | ||
| ignoreFile = any isIgnoreFile headerNodes | ||
| (ignoreFile, contentsNodes) | ||
| where | ||
| isComment :: Node -> Bool | ||
| isComment = isJust . getCommentContent | ||
| isSimpleComment :: C.Node -> Bool | ||
| isSimpleComment node = do | ||
| let isComment = isJust $ getCommentContent node | ||
| isNotXrefcheckAnnotation = isNothing $ getXrefcheckContent node | ||
| isComment && isNotXrefcheckAnnotation | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good renaming and use of I'm not sure though why providing mere
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here we want to split header nodes and other contents of the file. So if for example Hm, but also this function will stop at an invalid annotation. Maybe it's better to allow it too, and just skip it reporting an error.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, no, I think it's better to stop at incorrect annotations 🤔 |
||
|
|
||
| isIgnoreFile :: Node -> Bool | ||
| isIgnoreFile :: C.Node -> Bool | ||
| isIgnoreFile = (Just (IgnoreAnnotation IMAll) ==) . getAnnotation | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe we can have a function like
and use it here. Also, if this function has no wildcard pattern matchings, the compiler will ask us if we want any new annotation that we add in the future to be global or not.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Currently there can be only one type of annotation, which is |
||
|
|
||
| defNode :: Node | ||
| defNode = Node Nothing DOCUMENT [] -- hard-coded default Node | ||
| isHeaderNode :: C.Node -> Bool | ||
| isHeaderNode node = | ||
| any ($ node) | ||
| [ isSimpleComment | ||
| , isIgnoreFile | ||
| ] | ||
|
|
||
| defNode :: C.Node | ||
| defNode = C.Node Nothing DOCUMENT [] -- hard-coded default Node | ||
|
|
||
| makeError | ||
| :: Maybe PosInfo | ||
|
|
@@ -375,17 +437,17 @@ makeError | |
| -> [ScanError] | ||
| makeError pos fp errDescription = one $ ScanError (toPosition pos) fp errDescription | ||
|
|
||
| getCommentContent :: Node -> Maybe Text | ||
| getCommentContent :: C.Node -> Maybe Text | ||
| getCommentContent node = do | ||
| txt <- getHTMLText node | ||
| T.stripSuffix "-->" =<< T.stripPrefix "<!--" (T.strip txt) | ||
|
|
||
| getHTMLText :: Node -> Maybe Text | ||
| getHTMLText (Node _ (HTML_BLOCK txt) _) = Just txt | ||
| getHTMLText (Node _ (HTML_INLINE txt) _) = Just txt | ||
| getHTMLText :: C.Node -> Maybe Text | ||
| getHTMLText (C.Node _ (HTML_BLOCK txt) _) = Just txt | ||
| getHTMLText (C.Node _ (HTML_INLINE txt) _) = Just txt | ||
| getHTMLText _ = Nothing | ||
|
|
||
| getXrefcheckContent :: Node -> Maybe Text | ||
| getXrefcheckContent :: C.Node -> Maybe Text | ||
| getXrefcheckContent node = | ||
| let notStripped = T.stripPrefix "xrefcheck:" . T.strip =<< | ||
| getCommentContent node | ||
|
|
@@ -397,14 +459,14 @@ getXrefcheckContent node = | |
| -- As our annotations are always oneliners, we can fix this by simply setting | ||
| -- end line equals to start line and calculating end column from start column | ||
| -- and annotation length. | ||
| getPosition :: Node -> Maybe PosInfo | ||
| getPosition node@(Node pos _ _) = do | ||
| getPosition :: C.Node -> Maybe PosInfo | ||
| getPosition node@(C.Node pos _ _) = do | ||
| annLength <- length . T.strip <$> getHTMLText node | ||
| PosInfo sl sc _ _ <- pos | ||
| pure $ PosInfo sl sc sl (sc + annLength - 1) | ||
|
|
||
| -- | Extract `IgnoreMode` if current node is xrefcheck annotation. | ||
|
||
| getAnnotation :: Node -> Maybe GetAnnotation | ||
| getAnnotation :: C.Node -> Maybe GetAnnotation | ||
| getAnnotation node = getXrefcheckContent node <&> textToMode | ||
|
|
||
| textToMode :: Text -> GetAnnotation | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmhm, looking at the result of this change, I feel quite skeptical in fact.
When imagining the implementation, it seems simpler to keep an annotation in mind and think how it affects markdown nodes, rather than mentally sorting through all types of nodes (hard) and for each think how it is affected by each of our annotations (keeping all annotations in mind can become hard in the future). And this code goes against this model of thinking.
A related thing: code locality issues, what happens in
handleParagraphseems to affect what happens inhandleLink, this makes reasoning about the code correctness harder.Perhaps you applied this refactoring to make the code shorter? We really spared several lines (and on one such place I left a comment because I think it is arguable), but overall the code seems to take more space now even if we don't take the function signatures into account.
I mostly agree with other minor changes in this commit, but the core part of this commit I find suspicious.
Could you tell which benefits, in your opinion, this rewrite gives?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For me, it's easier to reason about code behavior when a node type is known, because our types for ignore modes are, IMHO, rather confusing and hard to understand. And, in contrast, node type is something very clear and easy to work with, so I preferred it.
Things only gets worse if there are some different types of annotation (the primary reason of the whole this refactor). We'll have to handle a lot of different cases for all types of annotations and nodes.