git-annex in bandali@jirud:~/src/site
[~bandali/bndl.org] / txt2pre
1 #!/usr/bin/env perl
2 # txt2pre --- convert my site's txt files to `pre'-based atom/rss/html
3
4 # Copyright (C) 2014-2021 all contributors <meta@public-inbox.org>
5 # Copyright (c) 2021 bandali <bandali@gnu.org>
6 #
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16 #
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19
20 # This simple script borrows from a script of the same name from the
21 # wonderful public-inbox project, under AGPLv3+, with additions of
22 # my own.
23
24 # Update (2021-11-01): this script isn't currently used for generating
25 # my site's pages anymore; but kept for future reference.
26
27
28 use strict;
29 use warnings 'all';
30 use Getopt::Long;
31
# Command-line settings and their defaults.  --format and --lang take a
# string value; --index, --header and --footer are plain flags.
my ($format, $lang) = ('html', 'en');
my ($index, $header, $footer) = ('', '', '');

my @option_spec = (
    'format=s' => \$format,
    'lang=s'   => \$lang,
    'index'    => \$index,
    'header'   => \$header,
    'footer'   => \$footer,
);
unless (GetOptions(@option_spec)) {
    die("bad command line arguments\n");
}

# Per-language site metadata.  Every value stays the empty string when
# --lang is neither 'en' nor 'fa'.
my ($author, $site_title, $site_desc, $site_url, $feed_id) = ('') x 5;
if ($lang eq 'en') {
    $author     = 'bandali';
    $site_title = "${author}'s personal site";
    $site_desc  = "notes and blog posts by $author";
    $site_url   = 'https://bndl.org';
    $feed_id    = "tag:bndl.org,2020:notes.$format";
}
elsif ($lang eq 'fa') {
    $author     = 'بندعلی';
    $site_title = "سایت شخصی $author";
    $site_desc  = "نوشته‌ها و بلاگ پست‌های $author";
    $site_url   = 'https://bndl.org/fa/';
    $feed_id    = "tag:bndl.org,2020:fa/notes.$format";
}
65
# Pattern used to find URLs in text.  Capture 1 is an optional opening
# character -- one of ( ' ! -- sitting directly before the URL; capture 2
# is the URL itself: one of the listed schemes, a host part, and an
# optional path with optional query and fragment.  Matching ignores case.
my $link_re = qr{
    ([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher)://
        [\@:\w\.-]+(?:/
            (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*)
            (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)?
            (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)?
        )?
    )}xi;

# For each opening character that capture 1 of $link_re can hold, the
# pattern that recognises its closing partner (plus at most one of
# . , ; +) at the very end of the matched URL.
my %pairs;
$pairs{'('} = qr/(\)[\.,;\+]?)\z/;    # Markdown (,), Ruby (+) (, for arrays)
$pairs{"'"} = qr/('[\.,;\+]?)\z/;     # Perl / Ruby
$pairs{'!'} = qr/(![\.,;\+]?)\z/;     # Perl / Ruby
80
# Entity to substitute for each escaped character.  The two quote
# entries are commented out, so quote characters pass through unchanged.
my %html_map = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    # '"' => '&quot;',
    # "'" => '&#39;',
);

# html_esc($text)
#
# Return a copy of $text in which every &, < and > has been replaced by
# its entity from %html_map.  The argument itself is not modified.
sub html_esc {
    my $text = shift;
    $text =~ s/([&<>])/$html_map{$1}/g;
    return $text;
}
94
# _link_markup($lead, $target)
#
# Build the replacement text for one $link_re match.  $lead is the
# optional opening character captured before the URL (may be undef) and
# $target is the matched URL.  Trailing punctuation that was swept into
# the URL is moved back out so that it lands after the closing </a>.
sub _link_markup {
    # Copy straight away: the arguments are the caller's $1 and $2, which
    # the substitutions below would otherwise overwrite.
    my ($lead, $target) = @_;
    $lead = '' unless $lead;
    my $trail = '';

    # it's fairly common to end URLs in messages with
    # '.', ',' or ';' to denote the end of a statement;
    # assume the intent was to end the statement/sentence
    # in English
    my $closer = $pairs{$lead};
    if (defined $closer) {
        # The URL was opened by ( ' or !: peel off the matching closer.
        $trail = $1 if $target =~ s/$closer//;
    }
    elsif ($target =~ s/(\))?([\.,;])\z//) {
        my ($paren, $punct) = ($1, $2);
        $trail = $punct;
        # require ')' to be paired with '('
        if (defined $paren) {
            if (index($target, '(') < 0) {
                $trail = ")$trail";
            }
            else {
                $target .= ')';
            }
        }
    }
    elsif ($target !~ /\(/ && $target =~ s/\)\z//) {
        $trail = ')';
    }

    return $lead . "<a href=\"$target\">$target</a>" . $trail;
}

# linkify($s)
#
# Return a copy of $s with every URL matched by $link_re wrapped in an
# <a href="..."> element.
sub linkify {
    my ($s) = @_;
    $s =~ s/$link_re/_link_markup($1, $2)/ge;
    return $s;
}
128
129
# Everything the script prints is accumulated here.
my $out = '';

# atom/rss feed header and footer
#
# With --index and a feed format the script emits only the feed preamble
# (--header) or the closing tags (--footer) and then jumps straight to
# PRINT, without reading a note from STDIN.
if ($index and ($format eq 'atom' or $format eq 'rss')) {
    if ($header) {
        # Current UTC time in the two formats used below; tr strips the
        # newline that date(1) appends.
        my $now_iso8601 = `date -Iseconds -u | tr -d \\\\n`;
        my $now_rfc5322 = `date -uR | tr -d \\\\n`;
        # The link pointing at the feed being generated is rel="self";
        # the one pointing at the other feed format is rel="alternate".
        my $atom_rel = $format eq 'atom' ? 'self' : 'alternate';
        my $rss_rel = $format eq 'rss' ? 'self' : 'alternate';
        # Inside an RSS channel the link elements must carry the atom:
        # prefix (declared on the <rss> element below); in an Atom feed
        # they are plain <link>.
        my $link = $format eq 'atom' ? 'link' : 'atom:link';
        my $links = '';
        if ($lang eq 'en') {
            $links = qq(
<$link hreflang="fa" href="https://bndl.org/fa/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link hreflang="fa" href="https://bndl.org/fa/" rel="alternate" type="text/html" />
<$link href="https://bndl.org/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org" rel="alternate" type="text/html" />);
        } elsif ($lang eq 'fa') {
            # Uses $link exactly like the 'en' branch: a hard-coded
            # <link href=... /> would be emitted unprefixed into the RSS
            # channel.
            $links = qq(
<$link hreflang="en" href="https://bndl.org/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="en" href="https://bndl.org/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="en" href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link hreflang="en" href="https://bndl.org" rel="alternate" type="text/html" />
<$link href="https://bndl.org/fa/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/fa/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org/fa/" rel="alternate" type="text/html" />);
        }
        # Drop the newline that directly follows the opening qq( above.
        $links =~ s/^\n//;

        $out .= '<?xml version="1.0" encoding="UTF-8" ?>';
        $out .= ($format eq 'atom') ? qq(
<feed xml:lang="$lang" xmlns="http://www.w3.org/2005/Atom">
<title>$site_title</title>
<subtitle>$site_desc</subtitle>
<id>$feed_id</id>
$links
<updated>$now_iso8601</updated>)
            : ($format eq 'rss') ? qq(
<rss version="2.0"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>$site_title</title>
<description>$site_desc</description>
<link>$site_url</link>
<language>$lang</language>
<lastBuildDate>$now_rfc5322</lastBuildDate>
<pubDate>$now_rfc5322</pubDate>
<ttl>1800</ttl>
$links)
            : '';
    } elsif ($footer) {
        $out .= ($format eq 'atom') ? '</feed>'
            : ($format eq 'rss') ? '</channel></rss>'
            : '';
    }

    # we're done
    goto PRINT;
}
195
196
# _date_output(@args)
#
# Run date(1) with @args and return what it prints, with newlines
# removed.  The command is started through list-form open, so no shell
# is involved and the arguments need no quoting.  Returns '' when the
# command cannot be started or prints nothing.
sub _date_output {
    my @args = @_;
    open(my $pipe, '-|', 'date', @args) or return '';
    my $text = do { local $/; <$pipe> };
    close($pipe);
    $text = '' unless defined $text;
    $text =~ tr/\n//d;
    return $text;
}

# Slurp the whole note from STDIN.
my $txt = do { local $/; <STDIN> };
$txt = '' unless defined $txt;

# The page title is the first line of the note, escaped and trimmed,
# with the author's name appended when the line does not already
# contain it.  index() is used rather than a pattern match because an
# empty $author would produce an empty pattern, which Perl treats as
# "reuse the last successful pattern".
my ($first_line) = $txt =~ /\A([^\n]+)/;
my $title = html_esc(defined $first_line ? $first_line : '');
$title =~ s/^\s+|\s+$//g;
$title .= " &#8212; $author" if index($title, $author) < 0;

# The last three lines of the note are read as "updated", "published"
# and "plain text" trailer lines (English or Persian labels).  A line
# without the expected label leaves its variable undefined.
my ($upd, $pub, $url) = $txt =~ /(.*)\r?\n(.*)\r?\n(.*)\r?\n?\z/;
($upd) = $upd =~ /(?:updated|ویرایش): (.*)/ if $upd;
($pub) = $pub =~ /(?:published|انتشار): (.*)/ if $pub;
$upd = $pub if (!$upd);
($url) = $url =~ /(?:plain text|متن ساده): (.*)/ if $url;
$url = 'https://bndl.org/bandali-cv.txt'
    if (!$url and $title =~ /curriculum vitae/);
$url = html_esc($url) if $url;

$txt = linkify(html_esc($txt));


# Derived values.  Each is declared unconditionally and only assigned
# when its input is present: `my $x = ... if COND` is documented as
# undefined behaviour.  Variables whose input is missing stay undef.
my ($upd_iso8601, $pub_iso8601, $pub_rfc5322);
$upd_iso8601 = _date_output('-Iseconds', '-ud', $upd) if $upd;
$pub_iso8601 = _date_output('-Iseconds', '-ud', $pub) if $pub;
$pub_rfc5322 = _date_output('-uRd', $pub) if $pub;

# HTML counterpart of the plain-text URL: drop an optional ".$lang"
# before ".txt", swap the extension, and strip a "bandali-" prefix from
# the file name.
my ($url_html, $slug, $note_id);
$url_html = $url =~ s/(?:[.]\Q$lang\E)?[.]txt$/.html/r if $url;
$url_html =~ s|/bandali-(.*)|/$1| if $url_html;
$slug = $url_html =~ s|.*/(.*)[.]html$|$1|r if $url_html;
$note_id = "$feed_id:$slug" if $url_html;
222
# note header
if ($format eq 'html') {
    $out .=
        '<!doctype html>'
        . qq(<html lang="$lang")
        # The closing '>' is appended outside the conditional so that it
        # is emitted for every language, including 'fa'.
        . ($lang eq 'fa' ? ' dir="rtl"' : '')
        . '>'
        . qq(<head>
<meta http-equiv="Content-Type"
content="text/html; charset=utf-8" />\n)
        . "<title>$title</title>\n"
        . qq(<link rel="icon" href="data:,">\n)
        # Link to the plain-text original when its URL is known.
        . ($url
           ? qq(<link rel="alternate" href="$url"
title="plain text" type="text/plain" />\n)
           : '')
        # Index pages additionally link to the other language's index.
        . (($index and $lang eq 'en')
           ? qq(<link rel="alternate" href="https://bndl.org/fa/"
hreflang="fa" title="persian" />\n)
           : ($index and $lang eq 'fa')
           ? qq(<link rel="alternate" href="https://bndl.org/"
hreflang="en" title="english" />\n)
           : '')
        . qq(<style>\@media(prefers-color-scheme:dark){
body{background:#1c1c1c;color:white;}a:link{color:#acdeff;}
a:visited{color:#f8f;}a:active{color:#e00;}})
        # Persian pages also get a font-face rule for the <pre> text.
        . ($lang eq 'fa'
           ? qq(\n\@font-face{font-family:sahel;font-weight:normal;
src:local('Sahel WOL'),local('Sahel'),
url('sahel.woff2')format('woff2');}pre{font-family:sahel})
           : '')
        . "</style>\n"
        . '</head><body><pre>';
} elsif ($format eq 'atom' or $format eq 'rss') {
    # Atom entries use <updated>; RSS items borrow it as <atom:updated>.
    my $atom_updated = ($format eq 'atom') ? 'updated' : 'atom:updated';
    # Declared unconditionally: `my ... if COND` is undefined behaviour.
    my $updated = '';
    $updated = "<$atom_updated>$upd_iso8601</$atom_updated>\n" if $upd;

    if ($format eq 'atom') {
        $out .= qq(
<entry xml:base="$site_url">
<author><name>$author</name></author>
<id>$note_id</id>
<published>$pub_iso8601</published>\n)
            . $updated .
            qq(<link href="$url" rel="alternate" type="text/plain" />
<link href="$url_html" rel="alternate" type="text/html" />
<title>$title</title>
<content type="html"><![CDATA[<pre>);
    } else {
        # RSS only mentions the update time when it differs from the
        # publication time.
        $out .= qq(
<item>
<title>$title</title>
<link>$url_html</link>
<guid isPermaLink="false">$note_id</guid>
<pubDate>$pub_rfc5322</pubDate>\n)
            . (($updated and $pub ne $upd) ? $updated : '') .
            qq(<content:encoded><![CDATA[<pre>);
    }
}
# note body
$out .= $txt;
# note footer
if ($format eq 'html') {
    $out .= '</pre></body></html>';
} elsif ($format eq 'atom') {
    $out .= "</pre>]]></content></entry>";
} elsif ($format eq 'rss') {
    $out .= "</pre>]]></content:encoded></item>";
}
293
# Final step, also the target of the feed header/footer shortcut: write
# the accumulated output followed by a newline, then flush STDOUT.
PRINT:
print STDOUT $out, "\n";
STDOUT->flush();