update/fix urls throughout
[~bandali/bndl.org] / txt2pre
CommitLineData
d3adcff4 1#!/usr/bin/env perl
22e69f33 2# txt2pre --- convert my site's txt files to `pre'-based atom/rss/html
d3adcff4
AB
3
4# Copyright (C) 2014-2021 all contributors <meta@public-inbox.org>
7808cfc7 5# Copyright (c) 2021 bandali <bandali@gnu.org>
d3adcff4
AB
6#
7# This program is free software: you can redistribute it and/or modify
8# it under the terms of the GNU Affero General Public License as
9# published by the Free Software Foundation, either version 3 of the
10# License, or (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU Affero General Public License for more details.
16#
17# You should have received a copy of the GNU Affero General Public License
18# along with this program. If not, see <https://www.gnu.org/licenses/>.
19
4ce760d0
AB
20# This simple script borrows from a script of the same name from the
21# wonderful public-inbox project, under AGPLv3+, with additions of
22# my own.
d3adcff4 23
7808cfc7
AB
24# Update (2021-11-01): this script isn't currently used for generating
25# my site's pages anymore; but kept for future reference.
26
d3adcff4
AB
27
28use strict;
29use warnings 'all';
e02deb23
AB
30use Getopt::Long;
31
32469801
AB
# Command-line options and their defaults.
#   --format=STR  output format; the script tests for 'html', 'atom', 'rss'
#   --lang=STR    language; the script tests for 'en' and 'fa'
#   --index, --header, --footer  boolean flags, false ('') unless given
my ($format, $lang) = ('html', 'en');
my ($index, $header, $footer) = ('', '', '');

my %opt_spec = (
    'format=s' => \$format,
    'lang=s'   => \$lang,
    'index'    => \$index,
    'header'   => \$header,
    'footer'   => \$footer,
);
GetOptions(%opt_spec)
    or die("bad command line arguments\n");
d3adcff4 44
32469801
AB
# Per-language site metadata.  Every value stays the empty string when
# $lang is neither 'en' nor 'fa'.
my ($author, $site_title, $site_desc, $site_url, $feed_id) = ('') x 5;
if ($lang eq 'en') {
    $author     = 'bandali';
    $site_title = "${author}'s personal site";
    $site_desc  = "notes and blog posts by $author";
    $site_url   = 'https://bndl.org';
    $feed_id    = "tag:bndl.org,2020:notes.$format";
} elsif ($lang eq 'fa') {
    $author     = 'بندعلی';
    $site_title = "سایت شخصی $author";
    $site_desc  = "نوشته‌ها و بلاگ پست‌های $author";
    $site_url   = 'https://bndl.org/fa/';
    $feed_id    = "tag:bndl.org,2020:fa/notes.$format";
}
65
d3adcff4
AB
# Matches a URL for one of the listed schemes.  Capture 1 is an optional
# leading '(', "'" or '!' immediately before the URL; capture 2 is the
# URL itself (host, then optional path, query and fragment).
my $link_re =
    qr{([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher)://
        [\@:\w\.-]+(?:/
            (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*)
            (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)?
            (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)?
        )?
    )}xi;

# For each leading delimiter, a pattern that matches its closing
# counterpart (optionally followed by one of . , ; +) at the very end of
# a URL, so that the closer can be moved out of the link.
#   ( ... )  Markdown (,), Ruby (+) (, for arrays)
#   ' ... '  Perl / Ruby
#   ! ... !  Perl / Ruby
my %pairs;
for my $delims (['(', ')'], ["'", "'"], ['!', '!']) {
    my ($opener, $closer) = @$delims;
    $pairs{$opener} = qr/(\Q$closer\E[\.,;\+]?)\z/;
}
80
# Entity for each character that html_esc replaces.  The two quote
# entries are commented out, so quote characters pass through unchanged.
my %html_map = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    # '"' => '&quot;',
    # "'" => '&#39;',
);

# html_esc($text): return a copy of $text in which every '&', '<' and
# '>' has been replaced by its entity from %html_map.
sub html_esc {
    my ($text) = @_;
    $text =~ s/([&<>])/$html_map{$1}/g;
    return $text;
}
94
# _anchor_for($lead, $href): build the replacement text for one URL match.
# $lead is the delimiter captured before the URL ('' if none) and $href
# the matched URL.  Trailing punctuation that most likely belongs to the
# surrounding sentence is moved out of the URL and emitted after the
# closing </a>.
sub _anchor_for {
    my ($lead, $href) = @_;
    my $tail = '';

    # it's fairly common to end URLs in messages with
    # '.', ',' or ';' to denote the end of a statement;
    # assume the intent was to end the statement/sentence
    # in English
    my $closer = $pairs{$lead};
    if (defined $closer) {
        # URL was opened by a known delimiter: strip its closing partner
        $tail = $1 if $href =~ s/$closer//;
    } elsif ($href =~ s/(\))?([\.,;])\z//) {
        my ($paren, $punct) = ($1, $2);
        $tail = $punct;
        # require ')' to be paired with '('
        if (defined $paren) {
            if (index($href, '(') < 0) {
                $tail = ")$tail";
            } else {
                $href .= ')';
            }
        }
    } elsif ($href !~ /\(/ && $href =~ s/\)\z//) {
        $tail = ')';
    }

    return $lead . "<a href=\"$href\">$href</a>" . $tail;
}

# linkify($s): return a copy of $s in which every URL matched by
# $link_re is wrapped in an <a href="..."> element.
sub linkify {
    my ($s) = @_;
    $s =~ s{$link_re}{ _anchor_for($1 || '', $2) }ge;
    return $s;
}
128
129
32469801
AB
# Accumulates everything the script prints.
my $out = '';

# atom/rss feed header and footer
#
# With --index and an atom/rss format the script emits only the feed
# wrapper: the opening part with --header, the closing tags with --footer
# (nothing if neither is given), and then jumps straight to PRINT without
# reading STDIN.
if ($index and ($format eq 'atom' or $format eq 'rss')) {
    if ($header) {
        # current time via date(1); `tr -d \n` strips the trailing newline
        my $now_iso8601 = `date -Iseconds -u | tr -d \\\\n`;
        my $now_rfc5322 = `date -uR | tr -d \\\\n`;
        # the feed being generated is "self"; the other one is "alternate"
        my $atom_rel = $format eq 'atom' ? 'self' : 'alternate';
        my $rss_rel = $format eq 'rss' ? 'self' : 'alternate';
        # inside RSS the link elements carry the atom: prefix
        my $link = $format eq 'atom' ? 'link' : 'atom:link';
        my $links = '';
        if ($lang eq 'en') {
            $links = qq(
<$link hreflang="fa" href="https://bndl.org/fa/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link hreflang="fa" href="https://bndl.org/fa/" rel="alternate" type="text/html" />
<$link href="https://bndl.org/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org" rel="alternate" type="text/html" />);
        } elsif ($lang eq 'fa') {
            # Use $link here as in the 'en' branch, so that the Persian
            # RSS feed gets <atom:link .../> rather than a bare
            # attribute-bearing <link .../>.
            $links = qq(
<$link hreflang="en" href="https://bndl.org/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="en" href="https://bndl.org/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="en" href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link hreflang="en" href="https://bndl.org" rel="alternate" type="text/html" />
<$link href="https://bndl.org/fa/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/fa/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org/fa/" rel="alternate" type="text/html" />);
        }
        # drop the newline that follows the opening qq( delimiter
        $links =~ s/^\n//;

        $out .= '<?xml version="1.0" encoding="UTF-8" ?>';
        if ($format eq 'atom') {
            $out .= qq(
<feed xml:lang="$lang" xmlns="http://www.w3.org/2005/Atom">
<title>$site_title</title>
<subtitle>$site_desc</subtitle>
<id>$feed_id</id>
$links
<updated>$now_iso8601</updated>);
        } elsif ($format eq 'rss') {
            $out .= qq(
<rss version="2.0"
 xmlns:atom="http://www.w3.org/2005/Atom"
 xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>$site_title</title>
<description>$site_desc</description>
<link>$site_url</link>
<language>$lang</language>
<lastBuildDate>$now_rfc5322</lastBuildDate>
<pubDate>$now_rfc5322</pubDate>
<ttl>1800</ttl>
$links);
        }
    } elsif ($footer) {
        $out .= ($format eq 'atom') ? '</feed>'
              : ($format eq 'rss')  ? '</channel></rss>'
              :                       '';
    }

    # we're done
    goto PRINT;
}
195
196
# _date_out(@args): run date(1) with @args and return what it prints,
# with newlines removed.  The command is started without a shell, so text
# taken from the input cannot be interpreted as shell syntax.  Returns ''
# if date cannot be started or prints nothing.
sub _date_out {
    my (@args) = @_;
    open(my $pipe, '-|', 'date', @args) or return '';
    my $printed = do { local $/; <$pipe> };
    close($pipe);
    $printed = '' unless defined $printed;
    $printed =~ tr/\n//d;
    return $printed;
}

# Slurp the whole note from standard input.
my $txt = do { local $/; <STDIN> };
$txt = '' unless defined $txt;

# The title is the first line of the note, escaped and trimmed, with the
# author's name appended unless the title already contains it.
my ($first_line) = $txt =~ /\A([^\n]+)/;
my $title = html_esc(defined $first_line ? $first_line : '');
$title =~ s/^\s+|\s+$//g;
# index() instead of /$author/: an empty $author would otherwise yield the
# empty pattern, which re-uses the last successful regex.
$title .= " &#8212; $author" if index($title, $author) < 0;

# The last three lines of the note are expected to carry, in order, the
# "updated", "published" and "plain text" fields (English or Persian
# labels).  A line that does not carry its label leaves the field undef.
my ($upd, $pub, $url) = $txt =~ /(.*)\r?\n(.*)\r?\n(.*)\r?\n?\z/;
($upd) = $upd =~ /(?:updated|ویرایش): (.*)/ if $upd;
($pub) = $pub =~ /(?:published|انتشار): (.*)/ if $pub;
$upd = $pub if (!$upd);
($url) = $url =~ /(?:plain text|متن ساده): (.*)/ if $url;
$url = 'https://bndl.org/bandali-cv.txt'
    if (!$url and $title =~ /curriculum vitae/);
$url = html_esc($url) if $url;

$txt = linkify(html_esc($txt));

# Timestamps in the formats the feeds need.  Declared unconditionally:
# `my $x = ... if COND` has undefined behaviour in Perl.
my $upd_iso8601 = $upd ? _date_out('-Iseconds', '-ud', $upd) : undef;
my $pub_iso8601 = $pub ? _date_out('-Iseconds', '-ud', $pub) : undef;
my $pub_rfc5322 = $pub ? _date_out('-uRd', $pub) : undef;

# Derive the HTML URL, the slug and the per-note id from the plain-text
# URL; all three stay undef when there is no URL.
my ($url_html, $slug, $note_id);
if ($url) {
    # foo.txt / foo.LANG.txt -> foo.html
    $url_html = $url =~ s/(?:[.]\Q$lang\E)?[.]txt$/.html/r;
    # .../bandali-NAME -> .../NAME
    $url_html =~ s|/bandali-(.*)|/$1|;
    $slug    = $url_html =~ s|.*/(.*)[.]html$|$1|r;
    $note_id = "$feed_id:$slug";
}
32469801
AB
222
# note header
#
# Appends the opening part of the note to $out: a complete HTML <head>
# and the opening <pre> for 'html', or the opening of one <entry>/<item>
# (up to the CDATA <pre>) for 'atom'/'rss'.
if ($format eq 'html') {
    # Computed on its own so that the closing '>' of <html ...> is
    # emitted for every language; inside a single parenthesised ternary
    # the '>' would bind to the else arm only.
    my $dir_attr = $lang eq 'fa' ? ' dir="rtl"' : '';
    $out .=
        '<!doctype html>'
        . qq(<html lang="$lang"$dir_attr>)
        . qq(<head>
<meta http-equiv="Content-Type"
content="text/html; charset=utf-8" />\n)
        . "<title>$title</title>\n"
        . qq(<link rel="icon" href="data:,">\n)
        . ($url
            ? qq(<link rel="alternate" href="$url"
title="plain text" type="text/plain" />\n)
            : '')
        . (($index and $lang eq 'en')
            ? qq(<link rel="alternate" href="https://bndl.org/fa/"
hreflang="fa" title="persian" />\n)
            : ($index and $lang eq 'fa')
            ? qq(<link rel="alternate" href="https://bndl.org/"
hreflang="en" title="english" />\n)
            : '')
        . qq(<style>\@media(prefers-color-scheme:dark){
body{background:#1c1c1c;color:white;}a:link{color:#acdeff;}
a:visited{color:#f8f;}a:active{color:#e00;}})
        . ($lang eq 'fa'
            ? qq(\n\@font-face{font-family:sahel;font-weight:normal;
src:local('Sahel WOL'),local('Sahel'),
url('sahel.woff2')format('woff2');}pre{font-family:sahel})
            : '')
        . "</style>\n"
        . '</head><body><pre>';
} elsif ($format eq 'atom' or $format eq 'rss') {
    # inside RSS the Atom <updated> element carries the atom: prefix
    my $updated_tag = $format eq 'atom' ? 'updated' : 'atom:updated';
    # Plain conditional expression: `my $x = ... if COND` has undefined
    # behaviour in Perl.
    my $updated =
        $upd ? "<$updated_tag>$upd_iso8601</$updated_tag>\n" : '';
    if ($format eq 'atom') {
        $out .= qq(
<entry xml:base="$site_url">
<author><name>$author</name></author>
<id>$note_id</id>
<published>$pub_iso8601</published>\n)
            . $updated
            . qq(<link href="$url" rel="alternate" type="text/plain" />
<link href="$url_html" rel="alternate" type="text/html" />
<title>$title</title>
<content type="html"><![CDATA[<pre>);
    } else {
        # rss: only mention the update time when it differs from pubDate
        $out .= qq(
<item>
<title>$title</title>
<link>$url_html</link>
<guid isPermaLink="false">$note_id</guid>
<pubDate>$pub_rfc5322</pubDate>\n)
            . (($updated and ($pub // '') ne $upd) ? $updated : '')
            . qq(<content:encoded><![CDATA[<pre>);
    }
}
# note body
$out .= $txt;
# note footer
if ($format eq 'html') {
    $out .= '</pre></body></html>';
} elsif ($format eq 'atom') {
    $out .= "</pre>]]></content></entry>";
} elsif ($format eq 'rss') {
    $out .= "</pre>]]></content:encoded></item>";
}

# Target of the `goto PRINT` used once the feed header/footer is built.
 PRINT:
print("$out\n");
STDOUT->flush;