espial/src/Handler/Archive.hs

149 lines
5.4 KiB
Haskell
Raw Normal View History

2019-01-31 02:54:47 +00:00
module Handler.Archive where
import Import
import Data.Function ((&))
2020-01-19 07:22:02 +00:00
import Data.Char (ord)
2020-01-19 06:47:53 +00:00
import qualified Data.Attoparsec.ByteString.Char8 as AP8
import qualified Data.Attoparsec.ByteString as AP
2019-01-31 02:54:47 +00:00
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as LBS
import qualified Data.ByteString.Char8 as BS8
import qualified Network.HTTP.Client as NH
import qualified Network.HTTP.Client.TLS as NH
import qualified Network.HTTP.Types.Status as NH
import qualified Web.FormUrlEncoded as WH
2020-03-29 23:49:28 +00:00
-- import qualified Control.Monad.Metrics as MM
2020-01-19 06:47:53 +00:00
import HTMLEntities.Decoder (htmlEncodedText)
import Data.Text.Lazy.Builder (toLazyText)
2020-07-20 17:43:03 +00:00
import Network.Wai (requestHeaderHost)
2019-01-31 02:54:47 +00:00
shouldArchiveBookmark :: User -> Key Bookmark -> Handler Bool
shouldArchiveBookmark user kbid = do
runDB (get kbid) >>= \case
Nothing -> pure False
Just bm -> do
pure $
(isNothing $ bookmarkArchiveHref bm) &&
(bookmarkShared bm)
&& not (_isArchiveBlacklisted bm)
&& userArchiveDefault user
archiveBookmarkUrl :: Key Bookmark -> String -> Handler ()
archiveBookmarkUrl kbid url =
(_fetchArchiveSubmitInfo >>= \case
Left e -> do
2020-03-29 23:49:28 +00:00
-- MM.increment "archive.fetchSubmitId_noparse"
2019-01-31 02:54:47 +00:00
$(logError) (pack e)
Right submitInfo -> do
userId <- requireAuthId
2020-07-20 17:43:03 +00:00
req <- _buildArchiveSubmitRequest submitInfo url
2020-03-29 23:49:28 +00:00
-- MM.increment "archive.submit"
2019-01-31 02:54:47 +00:00
res <- liftIO $ NH.httpLbs req =<< NH.getGlobalManager
let status = NH.responseStatus res
2020-03-29 23:49:28 +00:00
-- MM.increment ("archive.submit_status_" <> (pack.show) (NH.statusCode status))
2019-01-31 02:54:47 +00:00
let updateArchiveUrl = runDB . updateBookmarkArchiveUrl userId kbid . Just
headers = NH.responseHeaders res
case status of
s | s == NH.status200 ->
for_ (lookup "Refresh" headers >>= _parseRefreshHeaderUrl) updateArchiveUrl
2019-10-27 21:01:10 +00:00
s | s == NH.status302 || s == NH.status307 ->
2019-01-31 02:54:47 +00:00
for_ (lookup "Location" headers) (updateArchiveUrl . decodeUtf8)
_ -> $(logError) (pack (show res)))
`catch` (\(e::SomeException) -> ($(logError) $ (pack.show) e) >> throwIO e)
_isArchiveBlacklisted :: Bookmark -> Bool
_isArchiveBlacklisted (Bookmark {..}) =
[ "hulu"
, "livestream"
, "netflix"
, "skillsmatter"
, "twitch.tv"
, "vimeo"
, "youtu.be"
, "youtube"
, "archive."
] &
any (`isInfixOf` bookmarkHref)
_parseRefreshHeaderUrl :: ByteString -> Maybe Text
_parseRefreshHeaderUrl h = do
let u = BS8.drop 1 $ BS8.dropWhile (/= '=') h
if (not (null u))
then Just $ decodeUtf8 u
else Nothing
_fetchArchiveSubmitInfo :: Handler (Either String (String , String))
_fetchArchiveSubmitInfo = do
2020-03-29 23:49:28 +00:00
-- MM.increment "archive.fetchSubmitId"
2020-07-20 17:43:03 +00:00
req <- buildRequest "https://archive.li/"
res <- liftIO $ NH.httpLbs req =<< NH.getGlobalManager
2020-03-29 23:49:28 +00:00
-- MM.increment ("archive.fetchSubmitId_status_" <> (pack.show) (NH.statusCode (NH.responseStatus res)))
2019-01-31 02:54:47 +00:00
let body = LBS.toStrict (responseBody res)
2020-01-19 06:47:53 +00:00
action = _parseSubstring (AP8.string "action=\"") (AP8.notChar '"') body
submitId = _parseSubstring (AP8.string "submitid\" value=\"") (AP8.notChar '"') body
2020-07-20 17:43:03 +00:00
if statusCode (responseStatus res) == 200
then pure $ (,) <$> action <*> submitId
else pure $ Left $ "Invalid statusCode: " <> show (responseStatus res)
2019-01-31 02:54:47 +00:00
2020-01-19 06:47:53 +00:00
_parseSubstring :: AP8.Parser ByteString -> AP8.Parser Char -> BS.ByteString -> Either String String
2019-01-31 02:54:47 +00:00
_parseSubstring start inner res = do
2020-01-19 06:47:53 +00:00
(flip AP8.parseOnly) res (skipAnyTill start >> AP8.many1 inner)
2019-01-31 02:54:47 +00:00
where
2020-01-19 06:47:53 +00:00
skipAnyTill end = go where go = end *> pure () <|> AP8.anyChar *> go
2020-01-19 06:47:53 +00:00
fetchPageTitle :: String -> Handler (Either String Text)
fetchPageTitle url =
2020-03-29 23:49:28 +00:00
do
-- MM.increment "fetchPageTitle"
2020-07-20 17:43:03 +00:00
req <- buildRequest url
res <- liftIO $ NH.httpLbs req =<< NH.getGlobalManager
2020-01-19 06:47:53 +00:00
let body = LBS.toStrict (responseBody res)
pure (decodeHtmlBs <$> parseTitle body)
`catch` (\(e :: SomeException) -> do
2020-03-29 23:49:28 +00:00
-- MM.increment "fetchPageTitle.error"
2020-01-19 06:47:53 +00:00
$(logError) $ (pack . show) e
pure (Left (show e)))
where
2020-01-19 06:47:53 +00:00
parseTitle bs =
2020-07-20 17:43:03 +00:00
(flip AP.parseOnly) bs do
2020-01-19 06:47:53 +00:00
_ <- skipAnyTill (AP.string "<title")
_ <- skipAnyTill (AP.string ">")
2020-01-19 07:22:02 +00:00
let lt = toEnum (ord '<')
AP.takeTill (== lt)
2020-01-19 06:47:53 +00:00
decodeHtmlBs = toStrict . toLazyText . htmlEncodedText . decodeUtf8
skipAnyTill end = go where go = end *> pure () <|> AP.anyWord8 *> go
2020-07-20 17:43:03 +00:00
_buildArchiveSubmitRequest :: (String, String) -> String -> Handler NH.Request
_buildArchiveSubmitRequest (action, submitId) href = do
req <- buildRequest ("POST " <> action)
pure $ req
{ NH.requestHeaders = ("Content-Type", "application/x-www-form-urlencoded") : NH.requestHeaders req
, NH.requestBody =
NH.RequestBodyLBS $
WH.urlEncodeAsForm
(([("submitid", submitId), ("url", href)]) :: [(String, String)])
, NH.redirectCount = 0
}
buildRequest :: String -> Handler Request
buildRequest url = do
ua <- _archiveUserAgent
pure $ NH.parseRequest_ url & \r ->
r { NH.requestHeaders =
[ ("Cache-Control", "max-age=0")
, ("Connection", "keep-alive")
, ("DNT", "1")
, ("Host", "archive.li")
, ("Upgrade-Insecure-Requests", "1")
, ("User-Agent", ua)
]
}
_archiveUserAgent :: Handler ByteString
_archiveUserAgent = do
mHost <- pure . requestHeaderHost . reqWaiRequest =<< getRequest
pure $ ("espial-" <>) (maybe "" (BS8.takeWhile (/= ':')) mHost)