Improved RSS generation
This commit is contained in:
parent
29d1945cc0
commit
22a2d2bd6c
6 changed files with 267 additions and 66 deletions
111
mkrss.sh
111
mkrss.sh
|
@ -1,53 +1,58 @@
|
|||
#!/usr/bin/env nix-shell
|
||||
#!nix-shell -i zsh
|
||||
|
||||
rsstpl="rss.tpl"
|
||||
# Directory
|
||||
webdir="_site"
|
||||
postsdir="$webdir/posts"
|
||||
rssfile="$webdir/rss.xml"
|
||||
|
||||
xmlize() {
|
||||
local fic="$1";
|
||||
hxclean $fic
|
||||
}
|
||||
# maximal number of articles to put in the RSS file
|
||||
maxarticles=10
|
||||
|
||||
# RSS Metas
|
||||
rsstitle="her.esy.fun"
|
||||
rssurl="https://her.esy.fun/rss.xml"
|
||||
websiteurl="https://her.esy.fun"
|
||||
rssdescription="her.esy.fun articles, mostly random personal thoughts"
|
||||
rsslang="en"
|
||||
rssauthor="yann@esposito.host (Yann Esposito)"
|
||||
rssimgtitle="yogsototh"
|
||||
rssimgurl="https://her.esy.fun/img/FlatAvatar.png"
|
||||
|
||||
# HTML Accessors (similar to CSS accessors)
|
||||
dateaccessor='.article-date'
|
||||
contentaccessor='#content'
|
||||
# title and keyword shouldn't be changed
|
||||
titleaccessor='title'
|
||||
keywordsaccessor='meta[name=keywords]::attr(content)'
|
||||
|
||||
# Format a date for the RSS date elements (pubDate, lastBuildDate).
# Arguments: $1 - a date string accepted by `date --date`
# Outputs:   the date as "Day, DD Mon YYYY HH:MM:SS +ZZZZ" on stdout
# Returns:   1 (message on stderr) when $1 is empty, else the status of `date`
formatdate() {
  local d="$1"
  if [ -z "$d" ]; then
    echo "formatdate: missing date" >&2
    return 1
  fi
  # LC_ALL=C takes precedence over any LC_ALL/LC_TIME inherited from the
  # environment, so day and month names are always the English abbreviations.
  LC_ALL=C date --date "$d" +'%a, %d %b %Y %H:%M:%S %z'
}
|
||||
finddate(){
|
||||
local fic="$1"
|
||||
cat $fic | hxselect -c '.article-date'
|
||||
}
|
||||
findtitle(){
|
||||
local fic="$1"
|
||||
cat $fic | hxselect -c 'h1'
|
||||
}
|
||||
getcontent(){
|
||||
local fic="$1"
|
||||
cat $fic | hxselect '#content'
|
||||
}
|
||||
findkeywords(){
|
||||
local fic="$1"
|
||||
cat $fic | hxselect -c '.keywords > code' | sed 's/,//g'
|
||||
}
|
||||
|
||||
# Print the content of the element matched by $dateaccessor in the file $1.
finddate() { hxselect -c "$dateaccessor" < "$1"; }
|
||||
# Print the content of the element matched by $titleaccessor in the file $1.
findtitle() { hxselect -c "$titleaccessor" < "$1"; }
|
||||
# Print what hxselect matches for $contentaccessor in the file $1.
getcontent() { hxselect "$contentaccessor" < "$1"; }
|
||||
# Print the keywords selected by $keywordsaccessor in the file $1, with every
# comma removed.
findkeywords() { hxselect -c "$keywordsaccessor" < "$1" | sed 's/,//g'; }
|
||||
# Emit one RSS <category> element per keyword.
# Arguments: $@ - keywords, one per argument
# Outputs:   for each non-empty keyword, a newline followed by
#            <category>KEYWORD</category> on stdout
mkcategories() {
  local keyword
  # "$@" keeps each keyword as a single word whatever the shell's
  # word-splitting and globbing settings are.
  for keyword in "$@"; do
    # Empty arguments produce no element.
    [ -n "$keyword" ] || continue
    printf '\n<category>%s</category>' "$keyword"
  done
}
|
||||
|
||||
realname="Yann Esposito"
|
||||
website="https://her.esy.fun"
|
||||
|
||||
autoload -U colors && colors
|
||||
|
||||
# Scratch directory for the intermediate files; the EXIT trap removes it even
# when the script stops before reaching its end.
tmpdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$tmpdir"' EXIT
|
||||
typeset -a dates
|
||||
dates=( )
|
||||
for fic in $postsdir/**/*.html; do
|
||||
printf "%-30s" $(echo "$fic"|sed 's#^'$postsdir'/##')
|
||||
blogfile="$(echo "$fic"|sed 's#^'$postsdir'/##')"
|
||||
printf "%-30s" $blogfile
|
||||
xfic="$tmpdir/$fic.xml"
|
||||
mkdir -p $(dirname $xfic)
|
||||
xmlize $fic > $xfic
|
||||
hxclean $fic > $xfic
|
||||
d=$(finddate $xfic)
|
||||
echo -n " [$d]"
|
||||
rssdate=$(formatdate $d)
|
||||
|
@ -55,15 +60,59 @@ for fic in $postsdir/**/*.html; do
|
|||
keywords=( $(findkeywords $xfic) )
|
||||
printf ": %-55s" "$title ($keywords)"
|
||||
categories=$(mkcategories $keywords)
|
||||
blogfile="$(echo $fic | perl -pe 's#.*?/posts/#/posts/#')"
|
||||
printf "\\n<item>\\n<title>%s</title>\\n<guid>%s%s</guid>\\n<pubDate>%s</pubDate>%s\\n<description><![CDATA[\\n%s\\n]]></description>\\n</item>\\n\\n" "$title" "$website" "$blogfile" "$rssdate" "$categories" "$(getcontent "$xfic")" >> "$tmpdir/${d}-$(basename $fic).rss"
|
||||
{ printf "\\n<item>"
|
||||
printf "\\n<title>%s</title>" "$title"
|
||||
printf "\\n<guid>%s</guid>" "${websiteurl}/${blogfile}"
|
||||
printf "\\n<pubDate>%s</pubDate>%s" "$rssdate"
|
||||
printf "%s" "$categories"
|
||||
printf "\\n<description><![CDATA[\\n%s\\n]]></description>" "$(getcontent "$xfic")"
|
||||
printf "\\n</item>\\n\\n"
|
||||
} >> "$tmpdir/${d}-$(basename $fic).rss"
|
||||
dates=( $d $dates )
|
||||
echo " [${fg[green]}OK${reset_color}]"
|
||||
done
|
||||
for fic in $(ls $tmpdir/*.rss | sort -r); do
|
||||
# echo $fic
|
||||
echo "Publishing"
|
||||
for fic in $(ls $tmpdir/*.rss | sort -r | head -n $maxarticles ); do
|
||||
echo "${fic:t}"
|
||||
cat $fic >> $tmpdir/rss
|
||||
done
|
||||
|
||||
sed "/<!-- LB -->/r $tmpdir/rss" "$rsstpl" > "$rssfile"
|
||||
rssmaxdate=$(formatdate $(for d in $dates; do echo $d; done | sort -r | head -n 1))
|
||||
# formatdate only reads $1, so the build time has to reach it as one word:
# `now` is resolved by `date --date` itself.
rssbuilddate=$(formatdate now)
|
||||
{
|
||||
cat <<END
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:atom="http://www.w3.org/2005/Atom"
|
||||
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
|
||||
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
|
||||
xmlns:georss="http://www.georss.org/georss"
|
||||
xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
|
||||
xmlns:media="http://search.yahoo.com/mrss/"><channel>
|
||||
<title>${rsstitle}</title>
|
||||
<atom:link href="${rssurl}" rel="self" type="application/rss+xml" />
|
||||
<link>${websiteurl}</link>
|
||||
<description><![CDATA[${rssdescription}]]></description>
|
||||
<language>${rsslang}</language>
|
||||
<pubDate>${rssmaxdate}</pubDate>
|
||||
<lastBuildDate>$rssbuilddate</lastBuildDate>
|
||||
<generator>mkrss.sh</generator>
|
||||
<webMaster>${rssauthor}</webMaster>
|
||||
<image>
|
||||
<url>${rssimgurl}</url>
|
||||
<title>${rssimgtitle}</title>
|
||||
<link>${websiteurl}</link>
|
||||
</image>
|
||||
END
|
||||
cat $tmpdir/rss
|
||||
cat <<END
|
||||
</channel>
|
||||
</rss>
|
||||
END
|
||||
} > "$rssfile"
|
||||
|
||||
rm -rf $tmpdir
|
||||
echo "RSS Generated"
|
||||
|
|
|
@ -227,7 +227,7 @@ Return output file name."
|
|||
(setq org-publish-project-alist
|
||||
`(("orgfiles"
|
||||
:base-directory ,base-dir
|
||||
:exclude ".*drafts/.*\\|.*/rss.*"
|
||||
:exclude ".*drafts/.*"
|
||||
:base-extension "org"
|
||||
:publishing-directory ,publish-dir
|
||||
:recursive t
|
||||
|
|
BIN
project.el.sig
BIN
project.el.sig
Binary file not shown.
30
rss.tpl
30
rss.tpl
|
@ -1,30 +0,0 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:atom="http://www.w3.org/2005/Atom"
|
||||
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
|
||||
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
|
||||
xmlns:georss="http://www.georss.org/georss"
|
||||
xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
|
||||
xmlns:media="http://search.yahoo.com/mrss/"><channel>
|
||||
<title>her.esy.fun</title>
|
||||
<atom:link href="https://her.esy.fun/rss.xml" rel="self" type="application/rss+xml" />
|
||||
<link>https://her.esy.fun</link>
|
||||
<description><![CDATA[her.esy.fun articles, mostly random personal thoughts]]></description>
|
||||
<language>en</language>
|
||||
<pubDate>Mon, 23 Sep 2019 09:59:16 +0200</pubDate>
|
||||
<lastBuildDate>Mon, 23 Sep 2019 09:59:16 +0200</lastBuildDate>
|
||||
<generator>Emacs 26.3 Org-mode 9.2.5</generator>
|
||||
<webMaster>yann@esposito.host (Yann Esposito)</webMaster>
|
||||
<image>
|
||||
<url>https://her.esy.fun/img/FlatAvatar.png</url>
|
||||
<title>her.esy.fun</title>
|
||||
<link>https://her.esy.fun</link>
|
||||
</image>
|
||||
|
||||
<!-- LB -->
|
||||
|
||||
</channel>
|
||||
</rss>
|
|
@ -2,7 +2,8 @@
|
|||
#+AUTHOR: Yann Esposito
|
||||
#+EMAIL: yann@esposito.host
|
||||
#+DESCRIPTION: Articles
|
||||
- [2019-09-23] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/how-i-internet.org][How I Internet]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#minimalism@@html:</span>@@ @@html:<span class="keyword">@@#self-hosting@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@ @@html:<span class="keyword">@@#zen@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@How I protect myself against attention grabbers and many social media anti-patterns.@@html:</div>@@
|
||||
- [2019-08-18] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/project-el/index.org][Autoload Script by project]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#org-mode@@html:</span>@@ @@html:<span class="keyword">@@#programming@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@A script I use to load safely an eLISP file when entering a new project directory.@@html:</div>@@
|
||||
- [2019-08-17] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/troll-2/index.org][Troll 2]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#movie@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@I watched what may be the worse movie of all time and I still enjoyed greatly the show.@@html:</div>@@
|
||||
- [2019-08-17] *[[file:/Users/esposito/dev/her.esy.fun/src/posts/new-blog.org][New Blog]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#css@@html:</span>@@ @@html:<span class="keyword">@@#org-mode@@html:</span>@@ @@html:<span class="keyword">@@#programming@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@Meta article about how I generate this blog.@@html:</div>@@
|
||||
- [2019-09-30] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/rss-gen.org][RSS Generation]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#programming@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@How I generate RSS feed via command line@@html:</div>@@
|
||||
- [2019-09-23] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/how-i-internet.org][How I Internet]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#minimalism@@html:</span>@@ @@html:<span class="keyword">@@#self-hosting@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@ @@html:<span class="keyword">@@#zen@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@How I protect myself against attention grabbers and many social media anti-patterns.@@html:</div>@@
|
||||
- [2019-08-18] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/project-el/index.org][Autoload Script by project]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#org-mode@@html:</span>@@ @@html:<span class="keyword">@@#programming@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@A script I use to load safely an eLISP file when entering a new project directory.@@html:</div>@@
|
||||
- [2019-08-17] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/troll-2/index.org][Troll 2]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#movie@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@I watched what may be the worse movie of all time and I still enjoyed greatly the show.@@html:</div>@@
|
||||
- [2019-08-17] *[[file:/Users/yaesposi/y/her.esy.fun/src/posts/new-blog.org][New Blog]]* @@html:<div class="keywords">@@@@html:<span class="keyword">@@#blog@@html:</span>@@ @@html:<span class="keyword">@@#css@@html:</span>@@ @@html:<span class="keyword">@@#org-mode@@html:</span>@@ @@html:<span class="keyword">@@#programming@@html:</span>@@ @@html:<span class="keyword">@@#web@@html:</span>@@@@html:</div>@@@@html:<div class="description">@@Meta article about how I generate this blog.@@html:</div>@@
|
181
src/posts/rss-gen.org
Normal file
181
src/posts/rss-gen.org
Normal file
|
@ -0,0 +1,181 @@
|
|||
#+TITLE: RSS Generation
|
||||
#+SUBTITLE: How to generate RSS feed via command line
|
||||
#+AUTHOR: Yann Esposito
|
||||
#+EMAIL: yann@esposito.host
|
||||
#+DATE: [2019-09-30 Mon]
|
||||
#+KEYWORDS: programming, web
|
||||
#+DESCRIPTION: How I generate RSS feed via command line
|
||||
#+OPTIONS: auto-id:t
|
||||
|
||||
#+begin_notes
|
||||
TL;DR: To generate an RSS file you need to provide a lot of metadata.
|
||||
Those metadata are not part of all HTML files.
|
||||
So generating RSS from a tree of HTML files is not straightforward.
|
||||
Here is the script I use.
|
||||
#+end_notes
|
||||
|
||||
* RSS Problem
|
||||
:PROPERTIES:
|
||||
:CUSTOM_ID: rss-problem
|
||||
:END:
|
||||
|
||||
An RSS feed is meant to announce updates and new articles for a website.
|
||||
Each RSS entry must therefore have a date, a unique id, a title, maybe
|
||||
some categories, etc...
|
||||
|
||||
For most blog platforms or even static website generators, those meta infos
|
||||
are clearly put in the sources or in some DB.
|
||||
|
||||
I use =org-mode= for generating my website, and =ox-rss= is quite slow
|
||||
when generating an RSS with the full content of each item.
|
||||
Mainly, the way to achieve full content of my articles inside an RSS with
|
||||
=ox-rss= is by first creating a very big org file containing all the
|
||||
articles, and then transforming it into RSS. And this is very slow (many minutes).
|
||||
|
||||
So a simpler idea inspired by lb[fn:lb] is to generate the RSS directly
|
||||
from the generated HTML files.
|
||||
The only difficulty is to find the metadata inside those HTML.
|
||||
Unfortunately there is no real standard for all those metas.
|
||||
|
||||
As there is no standard place to have all those meta informations inside
|
||||
an HTML file, in order to use the HTML as source you'll need to "parse" the
|
||||
HTML file.
|
||||
For that purpose I use =html-xml-utils=.
|
||||
|
||||
Here is the full script I use:
|
||||
|
||||
#+begin_src bash
|
||||
#!/usr/bin/env nix-shell
|
||||
#!nix-shell -i zsh
|
||||
|
||||
# Directory
|
||||
webdir="_site"
|
||||
postsdir="$webdir/posts"
|
||||
rssfile="$webdir/rss.xml"
|
||||
|
||||
# maximal number of articles to put in the RSS file
|
||||
maxarticles=10
|
||||
|
||||
# RSS Metas
|
||||
rsstitle="her.esy.fun"
|
||||
rssurl="https://her.esy.fun/rss.xml"
|
||||
websiteurl="https://her.esy.fun"
|
||||
rssdescription="her.esy.fun articles, mostly random personal thoughts"
|
||||
rsslang="en"
|
||||
rssauthor="yann@esposito.host (Yann Esposito)"
|
||||
rssimgtitle="yogsototh"
|
||||
rssimgurl="https://her.esy.fun/img/FlatAvatar.png"
|
||||
|
||||
# HTML Accessors (similar to CSS accessors)
|
||||
dateaccessor='.article-date'
|
||||
contentaccessor='#content'
|
||||
# title and keyword shouldn't be changed
|
||||
titleaccessor='title'
|
||||
keywordsaccessor='meta[name=keywords]::attr(content)'
|
||||
|
||||
# Format a date for the RSS date elements (pubDate, lastBuildDate).
# Arguments: $1 - a date string accepted by `date --date`
# Outputs:   the date as "Day, DD Mon YYYY HH:MM:SS +ZZZZ" on stdout
# Returns:   1 (message on stderr) when $1 is empty, else the status of `date`
formatdate() {
  local d="$1"
  if [ -z "$d" ]; then
    echo "formatdate: missing date" >&2
    return 1
  fi
  # LC_ALL=C takes precedence over any LC_ALL/LC_TIME inherited from the
  # environment, so day and month names are always the English abbreviations.
  LC_ALL=C date --date "$d" +'%a, %d %b %Y %H:%M:%S %z'
}
|
||||
|
||||
# Print the content of the element matched by $dateaccessor in the file $1.
finddate() { hxselect -c "$dateaccessor" < "$1"; }
|
||||
# Print the content of the element matched by $titleaccessor in the file $1.
findtitle() { hxselect -c "$titleaccessor" < "$1"; }
|
||||
# Print what hxselect matches for $contentaccessor in the file $1.
getcontent() { hxselect "$contentaccessor" < "$1"; }
|
||||
# Print the keywords selected by $keywordsaccessor in the file $1, with every
# comma removed.
findkeywords() { hxselect -c "$keywordsaccessor" < "$1" | sed 's/,//g'; }
|
||||
# Emit one RSS <category> element per keyword.
# Arguments: $@ - keywords, one per argument
# Outputs:   for each non-empty keyword, a newline followed by
#            <category>KEYWORD</category> on stdout
mkcategories() {
  local keyword
  # "$@" keeps each keyword as a single word whatever the shell's
  # word-splitting and globbing settings are.
  for keyword in "$@"; do
    # Empty arguments produce no element.
    [ -n "$keyword" ] || continue
    printf '\n<category>%s</category>' "$keyword"
  done
}
|
||||
|
||||
autoload -U colors && colors
|
||||
|
||||
# Scratch directory for the intermediate files; the EXIT trap removes it even
# when the script stops before reaching its end.
tmpdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$tmpdir"' EXIT
|
||||
typeset -a dates
|
||||
dates=( )
|
||||
for fic in $postsdir/**/*.html; do
|
||||
blogfile="$(echo "$fic"|sed 's#^'$postsdir'/##')"
|
||||
printf "%-30s" $blogfile
|
||||
xfic="$tmpdir/$fic.xml"
|
||||
mkdir -p $(dirname $xfic)
|
||||
hxclean $fic > $xfic
|
||||
d=$(finddate $xfic)
|
||||
echo -n " [$d]"
|
||||
rssdate=$(formatdate $d)
|
||||
title=$(findtitle $xfic)
|
||||
keywords=( $(findkeywords $xfic) )
|
||||
printf ": %-55s" "$title ($keywords)"
|
||||
categories=$(mkcategories $keywords)
|
||||
{ printf "\\n<item>"
|
||||
printf "\\n<title>%s</title>" "$title"
|
||||
printf "\\n<guid>%s</guid>" "${websiteurl}/${blogfile}"
|
||||
printf "\\n<pubDate>%s</pubDate>%s" "$rssdate"
|
||||
printf "%s" "$categories"
|
||||
printf "\\n<description><![CDATA[\\n%s\\n]]></description>" "$(getcontent "$xfic")"
|
||||
printf "\\n</item>\\n\\n"
|
||||
} >> "$tmpdir/${d}-$(basename $fic).rss"
|
||||
dates=( $d $dates )
|
||||
echo " [${fg[green]}OK${reset_color}]"
|
||||
done
|
||||
echo "Publishing"
|
||||
for fic in $(ls $tmpdir/*.rss | sort -r | head -n $maxarticles ); do
|
||||
echo "${fic:t}"
|
||||
cat $fic >> $tmpdir/rss
|
||||
done
|
||||
|
||||
rssmaxdate=$(formatdate $(for d in $dates; do echo $d; done | sort -r | head -n 1))
|
||||
# formatdate only reads $1, so the build time has to reach it as one word:
# `now` is resolved by `date --date` itself.
rssbuilddate=$(formatdate now)
|
||||
{
|
||||
cat <<END
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<rss version="2.0"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:atom="http://www.w3.org/2005/Atom"
|
||||
xmlns:sy="http://purl.org/rss/1.0/modules/syndication/"
|
||||
xmlns:slash="http://purl.org/rss/1.0/modules/slash/"
|
||||
xmlns:georss="http://www.georss.org/georss"
|
||||
xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#"
|
||||
xmlns:media="http://search.yahoo.com/mrss/"><channel>
|
||||
<title>${rsstitle}</title>
|
||||
<atom:link href="${rssurl}" rel="self" type="application/rss+xml" />
|
||||
<link>${websiteurl}</link>
|
||||
<description><![CDATA[${rssdescription}]]></description>
|
||||
<language>${rsslang}</language>
|
||||
<pubDate>${rssmaxdate}</pubDate>
|
||||
<lastBuildDate>$rssbuilddate</lastBuildDate>
|
||||
<generator>mkrss.sh</generator>
|
||||
<webMaster>${rssauthor}</webMaster>
|
||||
<image>
|
||||
<url>${rssimgurl}</url>
|
||||
<title>${rssimgtitle}</title>
|
||||
<link>${websiteurl}</link>
|
||||
</image>
|
||||
END
|
||||
cat $tmpdir/rss
|
||||
cat <<END
|
||||
</channel>
|
||||
</rss>
|
||||
END
|
||||
} > "$rssfile"
|
||||
|
||||
rm -rf $tmpdir
|
||||
echo "RSS Generated"
|
||||
#+end_src
|
||||
|
||||
The =nix-shell= shebang is a neat trick to have all the dependencies I
|
||||
need when running my script. I could have added zsh, but my main concern
|
||||
was about =html-xml-utils=.
|
||||
|
||||
Alongside my script I have a =shell.nix= file containing:
|
||||
|
||||
#+begin_src nix
|
||||
{ pkgs ? import (fetchTarball https://github.com/NixOS/nixpkgs/archive/19.09-beta.tar.gz) {} }:
|
||||
pkgs.mkShell {
|
||||
buildInputs = [ pkgs.html-xml-utils ];
|
||||
}
|
||||
#+end_src
|
||||
|
||||
[fn:lb] https://github.com/LukeSmithxyz/lb
|
Loading…
Reference in a new issue