properly decode fetched title #8

This commit is contained in:
Jon Schoning 2020-01-19 00:47:53 -06:00
parent 7683c3413b
commit 9dabf770a6
3 changed files with 33 additions and 22 deletions

View file

@ -4,7 +4,7 @@ cabal-version: 1.12
--
-- see: https://github.com/sol/hpack
--
-- hash: e38da659643e4b4691321fcadef6f3700e48643736805020965305c3348a750e
-- hash: 9ee2c0431ff44c217101ae145895fa4c4df6141ac59c65d5d35f586938ce6657
name: espial
version: 0.0.8
@ -147,6 +147,7 @@ library
, foreign-store
, hjsmin >=0.1 && <0.3
, hscolour
, html-entities
, http-api-data >=0.3.4
, http-client
, http-client-tls >=0.3 && <0.4
@ -223,6 +224,7 @@ executable espial
, foreign-store
, hjsmin >=0.1 && <0.3
, hscolour
, html-entities
, http-api-data >=0.3.4
, http-client
, http-client-tls >=0.3 && <0.4
@ -295,6 +297,7 @@ executable migration
, foreign-store
, hjsmin >=0.1 && <0.3
, hscolour
, html-entities
, http-api-data >=0.3.4
, http-client
, http-client-tls >=0.3 && <0.4
@ -373,6 +376,7 @@ test-suite test
, hjsmin >=0.1 && <0.3
, hscolour
, hspec >=2.0.0
, html-entities
, http-api-data >=0.3.4
, http-client
, http-client-tls >=0.3 && <0.4

View file

@ -142,6 +142,7 @@ dependencies:
- transformers >= 0.2.2
- wai-middleware-metrics
- parser-combinators
- html-entities
# The library contains all of our application code. The executable
# defined below is just a thin wrapper.

View file

@ -2,7 +2,9 @@ module Handler.Archive where
import Import
import Data.Function ((&))
import qualified Data.Attoparsec.ByteString.Char8 as AP
import Data.Char (chr)
import qualified Data.Attoparsec.ByteString.Char8 as AP8
import qualified Data.Attoparsec.ByteString as AP
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Char8 as BS8
@ -11,6 +13,8 @@ import qualified Network.HTTP.Client.TLS as NH
import qualified Network.HTTP.Types.Status as NH
import qualified Web.FormUrlEncoded as WH
import qualified Control.Monad.Metrics as MM
import HTMLEntities.Decoder (htmlEncodedText)
import Data.Text.Lazy.Builder (toLazyText)
shouldArchiveBookmark :: User -> Key Bookmark -> Handler Bool
shouldArchiveBookmark user kbid = do
@ -88,36 +92,38 @@ _fetchArchiveSubmitInfo = do
res <- liftIO $ NH.httpLbs (buildSimpleRequest "https://archive.li/") =<< NH.getGlobalManager
MM.increment ("archive.fetchSubmitId_status_" <> (pack.show) (NH.statusCode (NH.responseStatus res)))
let body = LBS.toStrict (responseBody res)
action = _parseSubstring (AP.string "action=\"") (AP.notChar '"') body
submitId = _parseSubstring (AP.string "submitid\" value=\"") (AP.notChar '"') body
action = _parseSubstring (AP8.string "action=\"") (AP8.notChar '"') body
submitId = _parseSubstring (AP8.string "submitid\" value=\"") (AP8.notChar '"') body
pure $ (,) <$> action <*> submitId
_archiveUserAgent :: ByteString
_archiveUserAgent = "espial"
_parseSubstring :: AP.Parser ByteString -> AP.Parser Char -> BS.ByteString -> Either String String
_parseSubstring :: AP8.Parser ByteString -> AP8.Parser Char -> BS.ByteString -> Either String String
_parseSubstring start inner res = do
(flip AP.parseOnly) res (skipAnyTill start >> AP.many1 inner)
(flip AP8.parseOnly) res (skipAnyTill start >> AP8.many1 inner)
where
skipAnyTill end = go where go = end *> pure () <|> AP.anyChar *> go
skipAnyTill end = go where go = end *> pure () <|> AP8.anyChar *> go
fetchPageTitle :: String -> Handler (Either String String)
fetchPageTitle url = do
MM.increment "fetchPageTitle"
res <- liftIO $ NH.httpLbs (buildSimpleRequest url) =<< NH.getGlobalManager
let body = LBS.toStrict (responseBody res)
title = (flip AP.parseOnly) body $ do
_ <- skipAnyTill (AP.string "<title")
_ <- skipAnyTill (AP.string ">")
AP.many1 (AP.notChar '<')
pure title
`catch` (\(e::SomeException) -> do
MM.increment "fetchPageTitle.error"
$(logError) $ (pack.show) e
pure (Left (show e)))
fetchPageTitle :: String -> Handler (Either String Text)
fetchPageTitle url =
do MM.increment "fetchPageTitle"
res <- liftIO $ NH.httpLbs (buildSimpleRequest url) =<< NH.getGlobalManager
let body = LBS.toStrict (responseBody res)
pure (decodeHtmlBs <$> parseTitle body)
`catch` (\(e :: SomeException) -> do
MM.increment "fetchPageTitle.error"
$(logError) $ (pack . show) e
pure (Left (show e)))
where
skipAnyTill end = go where go = end *> pure () <|> AP.anyChar *> go
parseTitle bs =
(flip AP.parseOnly) bs $ do
_ <- skipAnyTill (AP.string "<title")
_ <- skipAnyTill (AP.string ">")
AP.takeTill (\w -> chr (fromEnum w) == '<')
decodeHtmlBs = toStrict . toLazyText . htmlEncodedText . decodeUtf8
skipAnyTill end = go where go = end *> pure () <|> AP.anyWord8 *> go
buildSimpleRequest :: String -> Request
buildSimpleRequest url =