From 9dabf770a6ebae2848eb09b3c53bacb9544f41b9 Mon Sep 17 00:00:00 2001 From: Jon Schoning Date: Sun, 19 Jan 2020 00:47:53 -0600 Subject: [PATCH] properly decode fetched title #8 --- espial.cabal | 6 +++++- package.yaml | 1 + src/Handler/Archive.hs | 48 ++++++++++++++++++++++++------------------ 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/espial.cabal b/espial.cabal index 5705963..aa9486e 100644 --- a/espial.cabal +++ b/espial.cabal @@ -4,7 +4,7 @@ cabal-version: 1.12 -- -- see: https://github.com/sol/hpack -- --- hash: e38da659643e4b4691321fcadef6f3700e48643736805020965305c3348a750e +-- hash: 9ee2c0431ff44c217101ae145895fa4c4df6141ac59c65d5d35f586938ce6657 name: espial version: 0.0.8 @@ -147,6 +147,7 @@ library , foreign-store , hjsmin >=0.1 && <0.3 , hscolour + , html-entities , http-api-data >=0.3.4 , http-client , http-client-tls >=0.3 && <0.4 @@ -223,6 +224,7 @@ executable espial , foreign-store , hjsmin >=0.1 && <0.3 , hscolour + , html-entities , http-api-data >=0.3.4 , http-client , http-client-tls >=0.3 && <0.4 @@ -295,6 +297,7 @@ executable migration , foreign-store , hjsmin >=0.1 && <0.3 , hscolour + , html-entities , http-api-data >=0.3.4 , http-client , http-client-tls >=0.3 && <0.4 @@ -373,6 +376,7 @@ test-suite test , hjsmin >=0.1 && <0.3 , hscolour , hspec >=2.0.0 + , html-entities , http-api-data >=0.3.4 , http-client , http-client-tls >=0.3 && <0.4 diff --git a/package.yaml b/package.yaml index f55d1c2..028a89a 100644 --- a/package.yaml +++ b/package.yaml @@ -142,6 +142,7 @@ dependencies: - transformers >= 0.2.2 - wai-middleware-metrics - parser-combinators +- html-entities # The library contains all of our application code. The executable # defined below is just a thin wrapper. diff --git a/src/Handler/Archive.hs b/src/Handler/Archive.hs index 9dda9cf..ba46915 100644 --- a/src/Handler/Archive.hs +++ b/src/Handler/Archive.hs @@ -2,7 +2,9 @@ module Handler.Archive where import Import import Data.Function ((&)) -import qualified Data.Attoparsec.ByteString.Char8 as AP +import Data.Char (chr) +import qualified Data.Attoparsec.ByteString.Char8 as AP8 +import qualified Data.Attoparsec.ByteString as AP import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Char8 as BS8 @@ -11,6 +13,8 @@ import qualified Network.HTTP.Client.TLS as NH import qualified Network.HTTP.Types.Status as NH import qualified Web.FormUrlEncoded as WH import qualified Control.Monad.Metrics as MM +import HTMLEntities.Decoder (htmlEncodedText) +import Data.Text.Lazy.Builder (toLazyText) shouldArchiveBookmark :: User -> Key Bookmark -> Handler Bool shouldArchiveBookmark user kbid = do @@ -88,36 +92,38 @@ _fetchArchiveSubmitInfo = do res <- liftIO $ NH.httpLbs (buildSimpleRequest "https://archive.li/") =<< NH.getGlobalManager MM.increment ("archive.fetchSubmitId_status_" <> (pack.show) (NH.statusCode (NH.responseStatus res))) let body = LBS.toStrict (responseBody res) - action = _parseSubstring (AP.string "action=\"") (AP.notChar '"') body - submitId = _parseSubstring (AP.string "submitid\" value=\"") (AP.notChar '"') body + action = _parseSubstring (AP8.string "action=\"") (AP8.notChar '"') body + submitId = _parseSubstring (AP8.string "submitid\" value=\"") (AP8.notChar '"') body pure $ (,) <$> action <*> submitId _archiveUserAgent :: ByteString _archiveUserAgent = "espial" -_parseSubstring :: AP.Parser ByteString -> AP.Parser Char -> BS.ByteString -> Either String String +_parseSubstring :: AP8.Parser ByteString -> AP8.Parser Char -> BS.ByteString -> Either String String _parseSubstring start inner res = do - (flip AP.parseOnly) res (skipAnyTill start >> AP.many1 inner) + (flip AP8.parseOnly) res (skipAnyTill start >> AP8.many1 inner) where - skipAnyTill end = go where go = end *> pure () <|> AP.anyChar *> go + skipAnyTill end = go where go = end *> pure () <|> AP8.anyChar *> go -fetchPageTitle :: String -> Handler (Either String String) -fetchPageTitle url = do - MM.increment "fetchPageTitle" - res <- liftIO $ NH.httpLbs (buildSimpleRequest url) =<< NH.getGlobalManager - let body = LBS.toStrict (responseBody res) - title = (flip AP.parseOnly) body $ do - _ <- skipAnyTill (AP.string "") - AP.many1 (AP.notChar '<') - pure title - `catch` (\(e::SomeException) -> do - MM.increment "fetchPageTitle.error" - $(logError) $ (pack.show) e - pure (Left (show e))) +fetchPageTitle :: String -> Handler (Either String Text) +fetchPageTitle url = + do MM.increment "fetchPageTitle" + res <- liftIO $ NH.httpLbs (buildSimpleRequest url) =<< NH.getGlobalManager + let body = LBS.toStrict (responseBody res) + pure (decodeHtmlBs <$> parseTitle body) + `catch` (\(e :: SomeException) -> do + MM.increment "fetchPageTitle.error" + $(logError) $ (pack . show) e + pure (Left (show e))) where - skipAnyTill end = go where go = end *> pure () <|> AP.anyChar *> go + parseTitle bs = + (flip AP.parseOnly) bs $ do + _ <- skipAnyTill (AP.string "") + AP.takeTill (\w -> chr (fromEnum w) == '<') + decodeHtmlBs = toStrict . toLazyText . htmlEncodedText . decodeUtf8 + skipAnyTill end = go where go = end *> pure () <|> AP.anyWord8 *> go buildSimpleRequest :: String -> Request buildSimpleRequest url =