espial/src/Handler/Archive.hs

132 lines
4.9 KiB
Haskell
Raw Normal View History

2019-01-31 02:54:47 +00:00
module Handler.Archive where
import Import
import Data.Function ((&))
2020-01-19 07:22:02 +00:00
import Data.Char (ord)
2020-01-19 06:47:53 +00:00
import qualified Data.Attoparsec.ByteString.Char8 as AP8
import qualified Data.Attoparsec.ByteString as AP
2019-01-31 02:54:47 +00:00
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Char8 as BS8
import qualified Network.HTTP.Client as NH
import qualified Network.HTTP.Client.TLS as NH
import qualified Network.HTTP.Types.Status as NH
import qualified Web.FormUrlEncoded as WH
import qualified Control.Monad.Metrics as MM
2020-01-19 06:47:53 +00:00
import HTMLEntities.Decoder (htmlEncodedText)
import Data.Text.Lazy.Builder (toLazyText)
2019-01-31 02:54:47 +00:00
shouldArchiveBookmark :: User -> Key Bookmark -> Handler Bool
shouldArchiveBookmark user kbid = do
runDB (get kbid) >>= \case
Nothing -> pure False
Just bm -> do
pure $
(isNothing $ bookmarkArchiveHref bm) &&
(bookmarkShared bm)
&& not (_isArchiveBlacklisted bm)
&& userArchiveDefault user
archiveBookmarkUrl :: Key Bookmark -> String -> Handler ()
archiveBookmarkUrl kbid url =
(_fetchArchiveSubmitInfo >>= \case
Left e -> do
MM.increment "archive.fetchSubmitId_noparse"
$(logError) (pack e)
Right submitInfo -> do
userId <- requireAuthId
let req = _buildArchiveSubmitRequest submitInfo url
MM.increment "archive.submit"
res <- liftIO $ NH.httpLbs req =<< NH.getGlobalManager
let status = NH.responseStatus res
MM.increment ("archive.submit_status_" <> (pack.show) (NH.statusCode status))
let updateArchiveUrl = runDB . updateBookmarkArchiveUrl userId kbid . Just
headers = NH.responseHeaders res
case status of
s | s == NH.status200 ->
for_ (lookup "Refresh" headers >>= _parseRefreshHeaderUrl) updateArchiveUrl
2019-10-27 21:01:10 +00:00
s | s == NH.status302 || s == NH.status307 ->
2019-01-31 02:54:47 +00:00
for_ (lookup "Location" headers) (updateArchiveUrl . decodeUtf8)
_ -> $(logError) (pack (show res)))
`catch` (\(e::SomeException) -> ($(logError) $ (pack.show) e) >> throwIO e)
_isArchiveBlacklisted :: Bookmark -> Bool
_isArchiveBlacklisted (Bookmark {..}) =
[ "hulu"
, "livestream"
, "netflix"
, "skillsmatter"
, "twitch.tv"
, "vimeo"
, "youtu.be"
, "youtube"
, "archive."
] &
any (`isInfixOf` bookmarkHref)
_parseRefreshHeaderUrl :: ByteString -> Maybe Text
_parseRefreshHeaderUrl h = do
let u = BS8.drop 1 $ BS8.dropWhile (/= '=') h
if (not (null u))
then Just $ decodeUtf8 u
else Nothing
_buildArchiveSubmitRequest :: (String, String) -> String -> NH.Request
_buildArchiveSubmitRequest (action, submitId) href =
NH.parseRequest_ ("POST " <> action) & \r ->
r { NH.requestHeaders =
[ ("User-Agent", _archiveUserAgent)
, ("Content-Type", "application/x-www-form-urlencoded")
]
, NH.requestBody = NH.RequestBodyLBS $ WH.urlEncodeAsForm ((
[ ("submitid" , submitId)
, ("url", href)
]) :: [(String, String)])
, NH.redirectCount = 0
}
_fetchArchiveSubmitInfo :: Handler (Either String (String , String))
_fetchArchiveSubmitInfo = do
MM.increment "archive.fetchSubmitId"
res <- liftIO $ NH.httpLbs (buildSimpleRequest "https://archive.li/") =<< NH.getGlobalManager
2019-01-31 02:54:47 +00:00
MM.increment ("archive.fetchSubmitId_status_" <> (pack.show) (NH.statusCode (NH.responseStatus res)))
let body = LBS.toStrict (responseBody res)
2020-01-19 06:47:53 +00:00
action = _parseSubstring (AP8.string "action=\"") (AP8.notChar '"') body
submitId = _parseSubstring (AP8.string "submitid\" value=\"") (AP8.notChar '"') body
2019-01-31 02:54:47 +00:00
pure $ (,) <$> action <*> submitId
_archiveUserAgent :: ByteString
_archiveUserAgent = "espial"
2020-01-19 06:47:53 +00:00
_parseSubstring :: AP8.Parser ByteString -> AP8.Parser Char -> BS.ByteString -> Either String String
2019-01-31 02:54:47 +00:00
_parseSubstring start inner res = do
2020-01-19 06:47:53 +00:00
(flip AP8.parseOnly) res (skipAnyTill start >> AP8.many1 inner)
2019-01-31 02:54:47 +00:00
where
2020-01-19 06:47:53 +00:00
skipAnyTill end = go where go = end *> pure () <|> AP8.anyChar *> go
2020-01-19 06:47:53 +00:00
fetchPageTitle :: String -> Handler (Either String Text)
fetchPageTitle url =
do MM.increment "fetchPageTitle"
res <- liftIO $ NH.httpLbs (buildSimpleRequest url) =<< NH.getGlobalManager
let body = LBS.toStrict (responseBody res)
pure (decodeHtmlBs <$> parseTitle body)
`catch` (\(e :: SomeException) -> do
MM.increment "fetchPageTitle.error"
$(logError) $ (pack . show) e
pure (Left (show e)))
where
2020-01-19 06:47:53 +00:00
parseTitle bs =
(flip AP.parseOnly) bs $ do
_ <- skipAnyTill (AP.string "<title")
_ <- skipAnyTill (AP.string ">")
2020-01-19 07:22:02 +00:00
let lt = toEnum (ord '<')
AP.takeTill (== lt)
2020-01-19 06:47:53 +00:00
decodeHtmlBs = toStrict . toLazyText . htmlEncodedText . decodeUtf8
skipAnyTill end = go where go = end *> pure () <|> AP.anyWord8 *> go
buildSimpleRequest :: String -> Request
buildSimpleRequest url =
NH.parseRequest_ url & \r ->
r {NH.requestHeaders = [("User-Agent", _archiveUserAgent)]}