txt2pre: eliminate 'Use of uninitialized value ...' warnings
[~bandali/bndl.org] / txt2pre
1 #!/usr/bin/env perl
2 # txt2pre --- convert my site's txt files to `pre'-based atom/rss/html
3
4 # Copyright (C) 2014-2021 all contributors <meta@public-inbox.org>
5 # Copyright (c) 2021 Amin Bandali <bandali@gnu.org>
6 #
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as
9 # published by the Free Software Foundation, either version 3 of the
10 # License, or (at your option) any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16 #
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <https://www.gnu.org/licenses/>.
19
20 # This simple script borrows from a script of the same name from the
21 # wonderful public-inbox project, under AGPLv3+, with additions of
22 # my own.
23
24
25 use strict;
26 use warnings 'all';
27 use Getopt::Long;
28
# Command-line options and their defaults.
#   --format=html|atom|rss   output flavour
#   --lang=en|fa             language of the note / site
#   --index, --header, --footer   boolean switches
my $format = 'html';
my $lang = 'en';
my $index = '';
my $header = '';
my $footer = '';

GetOptions(
    'format=s' => \$format,
    'lang=s'   => \$lang,
    'index'    => \$index,
    'header'   => \$header,
    'footer'   => \$footer,
) or die("bad command line arguments\n");

# Per-language site metadata.  Every value stays the empty string when
# --lang is neither 'en' nor 'fa'.
my ($author, $site_title, $site_desc, $site_url, $feed_id) = ('') x 5;
if ($lang eq 'en') {
    $author     = 'bandali';
    $site_title = "${author}'s personal site";
    $site_desc  = "notes and blog posts by $author";
    $site_url   = 'https://bndl.org';
    $feed_id    = "tag:bndl.org,2020:notes.$format";
}
elsif ($lang eq 'fa') {
    $author     = 'بندعلی';
    $site_title = "سایت شخصی $author";
    $site_desc  = "نوشته‌ها و بلاگ پست‌های $author";
    $site_url   = 'https://bndl.org/fa/';
    $feed_id    = "tag:bndl.org,2020:fa/notes.$format";
}
62
# URL matcher used by linkify().
#   $1  optional opening delimiter right before the URL: ( ' or !
#   $2  the URL itself: scheme, host part, then optional path, query
#       and fragment
# Compiled with /x, so the layout whitespace below is not part of the
# pattern, and /i, so matching is case-insensitive.
my $link_re =
    qr{([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher)://
        [\@:\w\.-]+(?:/
        (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*)
        (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)?
        (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)?
        )?
    )}xi;

# For each opening delimiter, the pattern that strips the matching
# closing delimiter (plus one optional trailing . , ; or +) from the
# end of a URL.
my %pairs = (
    '('  => qr/(\)[\.,;\+]?)\z/,    # Markdown (,), Ruby (+) (, for arrays)
    q{'} => qr/('[\.,;\+]?)\z/,     # Perl / Ruby
    '!'  => qr/(![\.,;\+]?)\z/,     # Perl / Ruby
);
77
# Characters that must be escaped in HTML/XML text content.  Quotes are
# deliberately left alone:
#   '"' => '&quot;',
#   "'" => '&#39;',
my %html_map = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
);

# html_esc($s) -- return a copy of $s with &, < and > replaced by their
# entities.
#
# An undefined argument yields the empty string.  Callers pass the
# result of a list-context regex match, which is the empty list when
# the match fails; returning '' avoids "uninitialized value" warnings
# both here and in whatever the caller does with the result.
sub html_esc {
    my ($s) = @_;
    return '' unless defined $s;
    $s =~ s/([&<>])/$html_map{$1}/g;
    return $s;
}
91
# linkify($s) -- return a copy of $s in which every URL matched by
# $link_re is wrapped in an <a href="..."> element.  Punctuation that
# most likely closes the surrounding sentence or bracket is kept
# outside of the link.
sub linkify {
    my ($s) = @_;
    $s =~ s{$link_re}{
        my ($beg, $url, $end) = ($1 || '', $2, '');

        # URLs in running text are commonly followed by '.', ',' or ';'
        # ending the statement; assume that punctuation belongs to the
        # sentence and not to the URL
        my $closer = $pairs{$beg};
        if (defined $closer) {
            # an opening delimiter preceded the URL: peel off its partner
            $end = $1 if $url =~ s/$closer//;
        }
        elsif ($url =~ s/(\))?([\.,;])\z//) {
            my ($paren, $punct) = ($1, $2);
            $end = $punct;
            # a ')' only stays in the URL when the URL has a '('
            if (defined $paren) {
                if (index($url, '(') >= 0) {
                    $url .= ')';
                }
                else {
                    $end = ")$end";
                }
            }
        }
        elsif (index($url, '(') < 0 && $url =~ s/\)\z//) {
            $end = ')';
        }

        join '', $beg, qq(<a href="$url">$url</a>), $end;
    }ge;
    return $s;
}
125
126
my $out = '';

# atom/rss feed header and footer
#
# With --index and an atom/rss format only the opening (--header) or the
# closing (--footer) part of the feed document is produced; control
# then jumps straight to the PRINT label.
if ($index and ($format eq 'atom' or $format eq 'rss')) {
    if ($header) {
        chomp(my $now_iso8601 = `date -Iseconds -u`);
        chomp(my $now_rfc5322 = `date -uR`);
        my $atom_rel = $format eq 'atom' ? 'self' : 'alternate';
        my $rss_rel  = $format eq 'rss'  ? 'self' : 'alternate';
        # In RSS the feed links are emitted as atom:link (the atom
        # namespace is declared on <rss> below); the channel's own
        # <link> element carries the site URL as text.
        my $link = $format eq 'atom' ? 'link' : 'atom:link';

        # [ href, rel when it is the current language, MIME type ]
        my %pages_for = (
            en => [
                [ 'https://bndl.org/notes.atom',  $atom_rel,   'application/atom+xml' ],
                [ 'https://bndl.org/notes.rss',   $rss_rel,    'application/rss+xml' ],
                [ 'https://bndl.org/bandali.txt', 'alternate', 'text/plain' ],
                [ 'https://bndl.org',             'alternate', 'text/html' ],
            ],
            fa => [
                [ 'https://bndl.org/fa/notes.atom',     $atom_rel,   'application/atom+xml' ],
                [ 'https://bndl.org/fa/notes.rss',      $rss_rel,    'application/rss+xml' ],
                [ 'https://bndl.org/fa/bandali.fa.txt', 'alternate', 'text/plain' ],
                [ 'https://bndl.org/fa/',               'alternate', 'text/html' ],
            ],
        );
        my %other_lang = (en => 'fa', fa => 'en');

        # Links to the other language's pages come first (tagged with
        # hreflang), followed by the current language's own pages.
        # Both languages use $link for the element name, so Persian RSS
        # feeds get atom:link elements just like the English ones.
        my $links = '';
        if (my $other = $other_lang{$lang}) {
            my @tags;
            for my $page (@{ $pages_for{$other} }) {
                my ($href, undef, $type) = @$page;
                push @tags,
                    qq(<$link hreflang="$other" href="$href" rel="alternate" type="$type" />);
            }
            for my $page (@{ $pages_for{$lang} }) {
                my ($href, $rel, $type) = @$page;
                push @tags, qq(<$link href="$href" rel="$rel" type="$type" />);
            }
            $links = join("\n", @tags);
        }

        $out .= '<?xml version="1.0" encoding="UTF-8" ?>';
        if ($format eq 'atom') {
            $out .= join("\n",
                '',
                qq(<feed xml:lang="$lang" xmlns="http://www.w3.org/2005/Atom">),
                "<title>$site_title</title>",
                "<subtitle>$site_desc</subtitle>",
                "<id>$feed_id</id>",
                $links,
                "<updated>$now_iso8601</updated>");
        }
        else {    # rss
            $out .= join("\n",
                '',
                '<rss version="2.0"',
                'xmlns:atom="http://www.w3.org/2005/Atom"',
                'xmlns:content="http://purl.org/rss/1.0/modules/content/">',
                '<channel>',
                "<title>$site_title</title>",
                "<description>$site_desc</description>",
                "<link>$site_url</link>",
                "<language>$lang</language>",
                "<lastBuildDate>$now_rfc5322</lastBuildDate>",
                "<pubDate>$now_rfc5322</pubDate>",
                '<ttl>1800</ttl>',
                $links);
        }
    }
    elsif ($footer) {
        $out .= $format eq 'atom' ? '</feed>' : '</channel></rss>';
    }

    # we're done
    goto PRINT;
}
192
193
# _date_utc($fmt_flag, $when) -- run date(1) in UTC on $when with the
# given output-format flag and return what it prints, newlines removed.
#
# The command is started without a shell (list-form pipe open), so
# quotes or other shell metacharacters in the note text cannot change
# the command line.  Returns '' when date cannot be started or prints
# nothing.
sub _date_utc {
    my ($fmt_flag, $when) = @_;
    my $pid = open(my $fh, '-|', 'date', $fmt_flag, '-u', '-d', $when);
    unless ($pid) {
        warn "txt2pre: cannot run date: $!\n";
        return '';
    }
    my $stamp = do { local $/; <$fh> };
    close($fh);    # a failing date has already complained on stderr
    $stamp = '' unless defined $stamp;
    $stamp =~ tr/\n//d;
    return $stamp;
}

# Slurp the whole note from standard input.
my $txt = do { local $/; <STDIN> };
$txt = '' unless defined $txt;

# The title is the first line of the note; an empty note gives ''.
my ($first_line) = $txt =~ /\A([^\n]+)/;
my $title = html_esc(defined $first_line ? $first_line : '');
$title =~ s/^\s+|\s+$//g;
# &mdash; is only defined for HTML; the feeds are plain XML without a
# DTD, so they get the equivalent numeric character reference.
my $dash = $format eq 'html' ? '&mdash;' : '&#8212;';
$title .= " $dash $author"
    if length $author and index($title, $author) < 0;

# The last three lines of a note may carry "updated", "published" and
# "plain text" fields, in English or Persian.
my ($upd, $pub, $url) = $txt =~ /(.*)\r?\n(.*)\r?\n(.*)\r?\n?\z/;
($upd) = $upd =~ /(?:updated|ویرایش): (.*)/ if $upd;
($pub) = $pub =~ /(?:published|انتشار): (.*)/ if $pub;
($url) = $url =~ /(?:plain text|متن ساده): (.*)/ if $url;
$url = 'https://bndl.org/bandali-cv.txt'
    if (!$url and $title =~ /curriculum vitae/);
$url = html_esc($url) if $url;

$txt = linkify(html_esc($txt));

# Derived values.  Each one is declared unconditionally and left undef
# when its source field is missing; "my $x = ... if COND" is avoided
# because perlsyn documents its behaviour as undefined.
my $upd_iso8601 = $upd ? _date_utc('-Iseconds', $upd) : undef;
my $pub_iso8601 = $pub ? _date_utc('-Iseconds', $pub) : undef;
my $pub_rfc5322 = $pub ? _date_utc('-R', $pub) : undef;
my $url_html = $url
    ? ($url =~ s/(?:[.]\Q$lang\E)?[.]txt$/.html/r)
    : undef;
$url_html =~ s|/bandali-(.*)|/$1| if $url_html;
my $slug = $url_html
    ? ($url_html =~ s|.*/(.*)[.]html$|$1|r)
    : undef;
my $note_id = $url_html ? "$feed_id:$slug" : undef;
218
# note header
#
# html: a complete document head up to the opening <pre>.
# atom/rss: the opening of a single <entry>/<item> up to the <pre>
# inside its CDATA section.
if ($format eq 'html') {
    # The attribute is optional but the closing '>' is not, so the two
    # are kept apart.
    my $dir_attr = $lang eq 'fa' ? ' dir="rtl"' : '';

    my $plain_link = $url
        ? qq(<link rel="alternate" href="$url"\n)
          . qq(title="plain text" type="text/plain" />\n)
        : '';

    # Index pages point at their counterpart in the other language.
    my $lang_link = '';
    if ($index and $lang eq 'en') {
        $lang_link = qq(<link rel="alternate" href="https://bndl.org/fa/"\n)
            . qq(hreflang="fa" title="persian" />\n);
    }
    elsif ($index and $lang eq 'fa') {
        $lang_link = qq(<link rel="alternate" href="https://bndl.org/"\n)
            . qq(hreflang="en" title="english" />\n);
    }

    my $style = join("\n",
        '<style>@media(prefers-color-scheme:dark){',
        'body{background:#1c1c1c;color:white;}a:link{color:#acdeff;}',
        'a:visited{color:#f8f;}a:active{color:#e00;}}');
    if ($lang eq 'fa') {
        $style .= join("\n",
            '',
            q(@font-face{font-family:sahel;font-weight:normal;),
            q(src:local('Sahel WOL'),local('Sahel'),),
            q(url('sahel.woff2')format('woff2');}pre{font-family:sahel}));
    }
    $style .= "</style>\n";

    $out .= '<!doctype html>'
        . qq(<html lang="$lang"$dir_attr>)
        . qq(<head>\n)
        . qq(<meta http-equiv="Content-Type"\n)
        . qq(content="text/html; charset=utf-8" />\n)
        . "<title>$title</title>\n"
        . qq(<link rel="icon" href="data:,">\n)
        . $plain_link
        . $lang_link
        . $style
        . '</head><body><pre>';
}
elsif ($format eq 'atom' or $format eq 'rss') {
    # Metadata fields missing from the note are undefined; substitute
    # '' so the output is the same as before, minus the warnings.
    my ($id, $published, $pub_date, $plain, $page, $upd_stamp) =
        map { defined $_ ? $_ : '' }
        ($note_id, $pub_iso8601, $pub_rfc5322, $url, $url_html,
         $upd_iso8601);

    my $atom_updated = $format eq 'atom' ? 'updated' : 'atom:updated';
    my $updated = $upd
        ? "<$atom_updated>$upd_stamp</$atom_updated>\n"
        : '';

    if ($format eq 'atom') {
        $out .= join("\n",
                '',
                qq(<entry xml:base="$site_url">),
                "<author><name>$author</name></author>",
                "<id>$id</id>",
                "<published>$published</published>",
                '')
            . $updated
            . join("\n",
                qq(<link href="$plain" rel="alternate" type="text/plain" />),
                qq(<link href="$page" rel="alternate" type="text/html" />),
                "<title>$title</title>",
                '<content type="html"><![CDATA[<pre>');
    }
    else {    # rss
        $out .= join("\n",
                '',
                '<item>',
                "<title>$title</title>",
                "<link>$page</link>",
                qq(<guid isPermaLink="false">$id</guid>),
                "<pubDate>$pub_date</pubDate>",
                '')
            . $updated
            . '<content:encoded><![CDATA[<pre>';
    }
}
# note body
$out .= $txt;

# note footer: close whatever the note header opened for this format
$out .= $format eq 'html' ? '</pre></body></html>'
      : $format eq 'atom' ? '</pre>]]></content></entry>'
      : $format eq 'rss'  ? '</pre>]]></content:encoded></item>'
      :                     '';

# Common exit point; the feed header/footer code jumps here directly.
PRINT:
print $out, "\n";
STDOUT->flush;