espial/src/Handler/Archive.hs
2024-02-09 23:44:01 +01:00

148 lines
5.4 KiB
Haskell

module Handler.Archive where
import Import
import Data.Function ((&))
import Data.Char (ord)
import qualified Data.Attoparsec.ByteString.Char8 as AP8
import qualified Data.Attoparsec.ByteString as AP
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Char8 as BS8
import qualified Network.HTTP.Client as NH
import qualified Network.HTTP.Client.TLS as NH
import qualified Network.HTTP.Types.Status as NH
import qualified Web.FormUrlEncoded as WH
-- import qualified Control.Monad.Metrics as MM
import HTMLEntities.Decoder (htmlEncodedText)
import Data.Text.Lazy.Builder (toLazyText)
import Network.Wai (requestHeaderHost)
shouldArchiveBookmark :: User -> Key Bookmark -> Handler Bool
shouldArchiveBookmark user kbid = do
runDB (get kbid) >>= \case
Nothing -> pure False
Just bm -> do
pure $
(isNothing $ bookmarkArchiveHref bm) &&
(bookmarkShared bm)
&& not (_isArchiveBlacklisted bm)
&& userArchiveDefault user
archiveBookmarkUrl :: Key Bookmark -> String -> Handler ()
archiveBookmarkUrl kbid url =
(_fetchArchiveSubmitInfo >>= \case
Left e -> do
-- MM.increment "archive.fetchSubmitId_noparse"
$(logError) (pack e)
Right submitInfo -> do
userId <- requireAuthId
req <- _buildArchiveSubmitRequest submitInfo url
-- MM.increment "archive.submit"
res <- liftIO $ NH.httpLbs req =<< NH.getGlobalManager
let status = NH.responseStatus res
-- MM.increment ("archive.submit_status_" <> (pack.show) (NH.statusCode status))
let updateArchiveUrl = runDB . updateBookmarkArchiveUrl userId kbid . Just
headers = NH.responseHeaders res
case status of
s | s == NH.status200 ->
for_ (lookup "Refresh" headers >>= _parseRefreshHeaderUrl) updateArchiveUrl
s | s == NH.status302 || s == NH.status307 ->
for_ (lookup "Location" headers) (updateArchiveUrl . decodeUtf8)
_ -> $(logError) (pack (show res)))
`catch` (\(e::SomeException) -> ($(logError) $ (pack.show) e) >> throwIO e)
_isArchiveBlacklisted :: Bookmark -> Bool
_isArchiveBlacklisted (Bookmark {..}) =
[ "hulu"
, "livestream"
, "netflix"
, "skillsmatter"
, "twitch.tv"
, "vimeo"
, "youtu.be"
, "youtube"
, "archive."
] &
any (`isInfixOf` bookmarkHref)
_parseRefreshHeaderUrl :: ByteString -> Maybe Text
_parseRefreshHeaderUrl h = do
let u = BS8.drop 1 $ BS8.dropWhile (/= '=') h
if (not (null u))
then Just $ decodeUtf8 u
else Nothing
_fetchArchiveSubmitInfo :: Handler (Either String (String , String))
_fetchArchiveSubmitInfo = do
-- MM.increment "archive.fetchSubmitId"
req <- buildRequest "https://archive.li/"
res <- liftIO $ NH.httpLbs req =<< NH.getGlobalManager
-- MM.increment ("archive.fetchSubmitId_status_" <> (pack.show) (NH.statusCode (NH.responseStatus res)))
let body = LBS.toStrict (responseBody res)
action = _parseSubstring (AP8.string "action=\"") (AP8.notChar '"') body
submitId = _parseSubstring (AP8.string "submitid\" value=\"") (AP8.notChar '"') body
if statusCode (responseStatus res) == 200
then pure $ (,) <$> action <*> submitId
else pure $ Left $ "Invalid statusCode: " <> show (responseStatus res)
_parseSubstring :: AP8.Parser ByteString -> AP8.Parser Char -> BS.ByteString -> Either String String
_parseSubstring start inner res = do
(flip AP8.parseOnly) res (skipAnyTill start >> AP8.many1 inner)
where
skipAnyTill end = go where go = end *> pure () <|> AP8.anyChar *> go
fetchPageTitle :: String -> Handler (Either String Text)
fetchPageTitle url =
do
-- MM.increment "fetchPageTitle"
req <- buildRequest url
res <- liftIO $ NH.httpLbs req =<< NH.getGlobalManager
let body = LBS.toStrict (responseBody res)
pure (decodeHtmlBs <$> parseTitle body)
`catch` (\(e :: SomeException) -> do
-- MM.increment "fetchPageTitle.error"
$(logError) $ (pack . show) e
pure (Left (show e)))
where
parseTitle bs =
(flip AP.parseOnly) bs do
_ <- skipAnyTill (AP.string "<title")
_ <- skipAnyTill (AP.string ">")
let lt = toEnum (ord '<')
AP.takeTill (== lt)
decodeHtmlBs = toStrict . toLazyText . htmlEncodedText . decodeUtf8
skipAnyTill end = go where go = end *> pure () <|> AP.anyWord8 *> go
_buildArchiveSubmitRequest :: (String, String) -> String -> Handler NH.Request
_buildArchiveSubmitRequest (action, submitId) href = do
req <- buildRequest ("POST " <> action)
pure $ req
{ NH.requestHeaders = ("Content-Type", "application/x-www-form-urlencoded") : NH.requestHeaders req
, NH.requestBody =
NH.RequestBodyLBS $
WH.urlEncodeAsForm
(([("submitid", submitId), ("url", href)]) :: [(String, String)])
, NH.redirectCount = 0
}
buildRequest :: String -> Handler Request
buildRequest url = do
ua <- _archiveUserAgent
pure $ NH.parseRequest_ url & \r ->
r { NH.requestHeaders =
[ ("Cache-Control", "max-age=0")
, ("Connection", "keep-alive")
, ("DNT", "1")
, ("Host", "archive.li")
, ("Upgrade-Insecure-Requests", "1")
, ("User-Agent", ua)
]
}
_archiveUserAgent :: Handler ByteString
_archiveUserAgent = do
mHost <- pure . requestHeaderHost . reqWaiRequest =<< getRequest
pure $ ("espial-" <>) (maybe "" (BS8.takeWhile (/= ':')) mHost)