properly decode fetched title #8

This commit is contained in:
Jon Schoning 2020-01-19 00:47:53 -06:00
parent 7683c3413b
commit 9dabf770a6
3 changed files with 33 additions and 22 deletions

View file

@ -4,7 +4,7 @@ cabal-version: 1.12
-- --
-- see: https://github.com/sol/hpack -- see: https://github.com/sol/hpack
-- --
-- hash: e38da659643e4b4691321fcadef6f3700e48643736805020965305c3348a750e -- hash: 9ee2c0431ff44c217101ae145895fa4c4df6141ac59c65d5d35f586938ce6657
name: espial name: espial
version: 0.0.8 version: 0.0.8
@ -147,6 +147,7 @@ library
, foreign-store , foreign-store
, hjsmin >=0.1 && <0.3 , hjsmin >=0.1 && <0.3
, hscolour , hscolour
, html-entities
, http-api-data >=0.3.4 , http-api-data >=0.3.4
, http-client , http-client
, http-client-tls >=0.3 && <0.4 , http-client-tls >=0.3 && <0.4
@ -223,6 +224,7 @@ executable espial
, foreign-store , foreign-store
, hjsmin >=0.1 && <0.3 , hjsmin >=0.1 && <0.3
, hscolour , hscolour
, html-entities
, http-api-data >=0.3.4 , http-api-data >=0.3.4
, http-client , http-client
, http-client-tls >=0.3 && <0.4 , http-client-tls >=0.3 && <0.4
@ -295,6 +297,7 @@ executable migration
, foreign-store , foreign-store
, hjsmin >=0.1 && <0.3 , hjsmin >=0.1 && <0.3
, hscolour , hscolour
, html-entities
, http-api-data >=0.3.4 , http-api-data >=0.3.4
, http-client , http-client
, http-client-tls >=0.3 && <0.4 , http-client-tls >=0.3 && <0.4
@ -373,6 +376,7 @@ test-suite test
, hjsmin >=0.1 && <0.3 , hjsmin >=0.1 && <0.3
, hscolour , hscolour
, hspec >=2.0.0 , hspec >=2.0.0
, html-entities
, http-api-data >=0.3.4 , http-api-data >=0.3.4
, http-client , http-client
, http-client-tls >=0.3 && <0.4 , http-client-tls >=0.3 && <0.4

View file

@ -142,6 +142,7 @@ dependencies:
- transformers >= 0.2.2 - transformers >= 0.2.2
- wai-middleware-metrics - wai-middleware-metrics
- parser-combinators - parser-combinators
- html-entities
# The library contains all of our application code. The executable # The library contains all of our application code. The executable
# defined below is just a thin wrapper. # defined below is just a thin wrapper.

View file

@ -2,7 +2,9 @@ module Handler.Archive where
import Import import Import
import Data.Function ((&)) import Data.Function ((&))
import qualified Data.Attoparsec.ByteString.Char8 as AP import Data.Char (chr)
import qualified Data.Attoparsec.ByteString.Char8 as AP8
import qualified Data.Attoparsec.ByteString as AP
import qualified Data.ByteString as BS import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Char8 as BS8 import qualified Data.ByteString.Char8 as BS8
@ -11,6 +13,8 @@ import qualified Network.HTTP.Client.TLS as NH
import qualified Network.HTTP.Types.Status as NH import qualified Network.HTTP.Types.Status as NH
import qualified Web.FormUrlEncoded as WH import qualified Web.FormUrlEncoded as WH
import qualified Control.Monad.Metrics as MM import qualified Control.Monad.Metrics as MM
import HTMLEntities.Decoder (htmlEncodedText)
import Data.Text.Lazy.Builder (toLazyText)
shouldArchiveBookmark :: User -> Key Bookmark -> Handler Bool shouldArchiveBookmark :: User -> Key Bookmark -> Handler Bool
shouldArchiveBookmark user kbid = do shouldArchiveBookmark user kbid = do
@ -88,36 +92,38 @@ _fetchArchiveSubmitInfo = do
res <- liftIO $ NH.httpLbs (buildSimpleRequest "https://archive.li/") =<< NH.getGlobalManager res <- liftIO $ NH.httpLbs (buildSimpleRequest "https://archive.li/") =<< NH.getGlobalManager
MM.increment ("archive.fetchSubmitId_status_" <> (pack.show) (NH.statusCode (NH.responseStatus res))) MM.increment ("archive.fetchSubmitId_status_" <> (pack.show) (NH.statusCode (NH.responseStatus res)))
let body = LBS.toStrict (responseBody res) let body = LBS.toStrict (responseBody res)
action = _parseSubstring (AP.string "action=\"") (AP.notChar '"') body action = _parseSubstring (AP8.string "action=\"") (AP8.notChar '"') body
submitId = _parseSubstring (AP.string "submitid\" value=\"") (AP.notChar '"') body submitId = _parseSubstring (AP8.string "submitid\" value=\"") (AP8.notChar '"') body
pure $ (,) <$> action <*> submitId pure $ (,) <$> action <*> submitId
_archiveUserAgent :: ByteString _archiveUserAgent :: ByteString
_archiveUserAgent = "espial" _archiveUserAgent = "espial"
_parseSubstring :: AP.Parser ByteString -> AP.Parser Char -> BS.ByteString -> Either String String _parseSubstring :: AP8.Parser ByteString -> AP8.Parser Char -> BS.ByteString -> Either String String
_parseSubstring start inner res = do _parseSubstring start inner res = do
(flip AP.parseOnly) res (skipAnyTill start >> AP.many1 inner) (flip AP8.parseOnly) res (skipAnyTill start >> AP8.many1 inner)
where where
skipAnyTill end = go where go = end *> pure () <|> AP.anyChar *> go skipAnyTill end = go where go = end *> pure () <|> AP8.anyChar *> go
fetchPageTitle :: String -> Handler (Either String String) fetchPageTitle :: String -> Handler (Either String Text)
fetchPageTitle url = do fetchPageTitle url =
MM.increment "fetchPageTitle" do MM.increment "fetchPageTitle"
res <- liftIO $ NH.httpLbs (buildSimpleRequest url) =<< NH.getGlobalManager res <- liftIO $ NH.httpLbs (buildSimpleRequest url) =<< NH.getGlobalManager
let body = LBS.toStrict (responseBody res) let body = LBS.toStrict (responseBody res)
title = (flip AP.parseOnly) body $ do pure (decodeHtmlBs <$> parseTitle body)
_ <- skipAnyTill (AP.string "<title") `catch` (\(e :: SomeException) -> do
_ <- skipAnyTill (AP.string ">") MM.increment "fetchPageTitle.error"
AP.many1 (AP.notChar '<') $(logError) $ (pack . show) e
pure title pure (Left (show e)))
`catch` (\(e::SomeException) -> do
MM.increment "fetchPageTitle.error"
$(logError) $ (pack.show) e
pure (Left (show e)))
where where
skipAnyTill end = go where go = end *> pure () <|> AP.anyChar *> go parseTitle bs =
(flip AP.parseOnly) bs $ do
_ <- skipAnyTill (AP.string "<title")
_ <- skipAnyTill (AP.string ">")
AP.takeTill (\w -> chr (fromEnum w) == '<')
decodeHtmlBs = toStrict . toLazyText . htmlEncodedText . decodeUtf8
skipAnyTill end = go where go = end *> pure () <|> AP.anyWord8 *> go
buildSimpleRequest :: String -> Request buildSimpleRequest :: String -> Request
buildSimpleRequest url = buildSimpleRequest url =