txt2pre: use '&#8212;' instead of '—' (to fix feed validation)
[~bandali/bndl.org] / txt2pre
CommitLineData
d3adcff4 1#!/usr/bin/env perl
22e69f33 2# txt2pre --- convert my site's txt files to `pre'-based atom/rss/html
d3adcff4
AB
3
4# Copyright (C) 2014-2021 all contributors <meta@public-inbox.org>
5# Copyright (c) 2021 Amin Bandali <bandali@gnu.org>
6#
7# This program is free software: you can redistribute it and/or modify
8# it under the terms of the GNU Affero General Public License as
9# published by the Free Software Foundation, either version 3 of the
10# License, or (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU Affero General Public License for more details.
16#
17# You should have received a copy of the GNU Affero General Public License
18# along with this program. If not, see <https://www.gnu.org/licenses/>.
19
4ce760d0
AB
20# This simple script borrows from a script of the same name from the
21# wonderful public-inbox project, under AGPLv3+, with additions of
22# my own.
d3adcff4
AB
23
24
25use strict;
26use warnings 'all';
e02deb23
AB
27use Getopt::Long;
28
32469801
AB
# Command-line options and their defaults:
#   --format=FMT  output format; 'html' unless given (the script also
#                 handles 'atom' and 'rss')
#   --lang=CODE   content language; 'en' unless given ('fa' is also handled)
#   --index       flag, off by default
#   --header      flag, off by default
#   --footer      flag, off by default
my ($format, $lang) = ('html', 'en');
my ($index, $header, $footer) = ('', '', '');

my $parsed_ok = GetOptions(
    'format=s' => \$format,
    'lang=s'   => \$lang,
    'index'    => \$index,
    'header'   => \$header,
    'footer'   => \$footer,
);
die("bad command line arguments\n") unless $parsed_ok;
d3adcff4 41
32469801
AB
# Per-language site metadata.  Every value stays the empty string when
# $lang is neither 'en' nor 'fa'.
my ($author, $site_title, $site_desc, $site_url, $feed_id) = ('') x 5;

if ($lang eq 'en') {
    $author     = 'bandali';
    $site_title = "${author}'s personal site";
    $site_desc  = "notes and blog posts by $author";
    $site_url   = 'https://bndl.org';
    $feed_id    = "tag:bndl.org,2020:notes.$format";
} elsif ($lang eq 'fa') {
    $author     = 'بندعلی';
    $site_title = "سایت شخصی $author";
    $site_desc  = "نوشته‌ها و بلاگ پست‌های $author";
    $site_url   = 'https://bndl.org/fa/';
    $feed_id    = "tag:bndl.org,2020:fa/notes.$format";
}
62
d3adcff4
AB
# Pattern used by linkify() to find URLs (case-insensitive, /x layout).
#   $1 - optional single character right before the URL: '(', "'" or '!'
#   $2 - the URL itself: scheme://host, optionally followed by a path,
#        a ?query and a #fragment
my $link_re =
    qr{([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher)://
        [\@:\w\.-]+(?:/
        (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*)
        (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)?
        (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)?
        )?
        )}xi;

# Keyed by the character captured in $1 above.  Each value matches the
# corresponding closing character (plus at most one of . , ; +) at the very
# end of the URL, so that linkify() can move it out of the link.
my %pairs = (
    # Markdown (,), Ruby (+) (, for arrays)
    "(" => qr/(\)[\.,;\+]?)\z/,
    # Perl / Ruby
    "'" => qr/('[\.,;\+]?)\z/,
    # Perl / Ruby
    "!" => qr/(![\.,;\+]?)\z/,
);
77
# Characters replaced by html_esc() and the entities they become.
# Quote characters are not escaped; their entries are kept commented out.
my %html_map = (
    '<' => '&lt;',
    '>' => '&gt;',
    '&' => '&amp;',
    # '"' => '&quot;',
    # "'" => '&#39;',
);

# Return a copy of $s with every '&', '<' and '>' replaced by its HTML
# entity.  The argument itself is not modified.
sub html_esc {
    my ($s) = @_;
    $s =~ s{([<>&])}{$html_map{$1}}g;
    return $s;
}
91
# Wrap every URL found by $link_re in $s in an <a href="..."> element and
# return the result.  The script calls this on text that has already been
# through html_esc(), so URLs are inserted into the markup as-is.
sub linkify {
    my ($s) = @_;
    $s =~ s/$link_re/_link_html($1, $2)/ge;
    return $s;
}

# Build the replacement for one $link_re match.
#   $beg - the optional '(', "'" or '!' captured right before the URL
#   $url - the raw URL match, possibly with trailing punctuation attached
# Returns $beg, the <a> element, and whatever was split off the end of the
# URL, in that order.
sub _link_html {
    # copy the arguments before running any regex: they alias $1 and $2
    my ($beg, $url) = @_;
    $beg = '' unless defined $beg;
    my ($end, $tail) = ('', '');

    # The text was HTML-escaped before linkification, so a '<' or '>' that
    # followed the URL in the source (e.g. "<https://example.org/a>") is now
    # "&lt;" / "&gt;", and $link_re accepts both '&' and ';' inside a path.
    # A raw angle bracket cannot be part of a URL, so cut the URL off there
    # and emit the rest unchanged after the link.
    if ($url =~ s/(&[gl]t;.*)\z//s) {
        $tail = $1;
    }

    # it's fairly common to end URLs in messages with
    # '.', ',' or ';' to denote the end of a statement;
    # assume the intent was to end the statement/sentence
    # in English
    if (defined(my $re = $pairs{$beg})) {
        if ($url =~ s/$re//) {
            $end = $1;
        }
    } elsif ($url =~ s/(\))?([\.,;])\z//) {
        $end = $2;
        # require ')' to be paired with '('
        if (defined $1) { # ')'
            if (index($url, '(') < 0) {
                $end = ")$end";
            } else {
                $url .= ')';
            }
        }
    } elsif ($url !~ /\(/ && $url =~ s/\)\z//) {
        $end = ')';
    }

    return $beg . "<a href=\"$url\">$url</a>" . $end . $tail;
}
125
126
32469801
AB
my $out = '';

# atom/rss feed header and footer
#
# With --index and an atom/rss format, only the opening (--header) or the
# closing (--footer) part of the feed document is produced; the script then
# jumps straight to the final print without reading a note.
if ($index and ($format eq 'atom' or $format eq 'rss')) {
    if ($header) {
        # current UTC time from date(1), with the trailing newline removed
        my $now_iso8601 = `date -Iseconds -u | tr -d \\\\n`;
        my $now_rfc5322 = `date -uR | tr -d \\\\n`;
        # the feed being written is "self", its sibling format "alternate"
        my $atom_rel = $format eq 'atom' ? 'self' : 'alternate';
        my $rss_rel = $format eq 'rss' ? 'self' : 'alternate';
        # In an RSS channel the Atom link elements need the atom: prefix
        # (the xmlns:atom namespace is declared on <rss> below); a bare
        # <link> there is RSS's own text-only element.  Both language
        # branches therefore use $link.
        my $link = $format eq 'atom' ? 'link' : 'atom:link';
        my $links = '';
        if ($lang eq 'en') {
            $links = qq(
<$link hreflang="fa" href="https://bndl.org/fa/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link hreflang="fa" href="https://bndl.org/fa/" rel="alternate" type="text/html" />
<$link href="https://bndl.org/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org" rel="alternate" type="text/html" />);
        } elsif ($lang eq 'fa') {
            $links = qq(
<$link hreflang="en" href="https://bndl.org/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="en" href="https://bndl.org/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="en" href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link hreflang="en" href="https://bndl.org" rel="alternate" type="text/html" />
<$link href="https://bndl.org/fa/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/fa/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org/fa/" rel="alternate" type="text/html" />);
        }
        # drop the newline that follows the opening qq( delimiter
        $links =~ s/^\n//;

        $out .= '<?xml version="1.0" encoding="UTF-8" ?>';
        if ($format eq 'atom') {
            $out .= qq(
<feed xml:lang="$lang" xmlns="http://www.w3.org/2005/Atom">
<title>$site_title</title>
<subtitle>$site_desc</subtitle>
<id>$feed_id</id>
$links
<updated>$now_iso8601</updated>);
        } elsif ($format eq 'rss') {
            $out .= qq(
<rss version="2.0"
     xmlns:atom="http://www.w3.org/2005/Atom"
     xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>$site_title</title>
<description>$site_desc</description>
<link>$site_url</link>
<language>$lang</language>
<lastBuildDate>$now_rfc5322</lastBuildDate>
<pubDate>$now_rfc5322</pubDate>
<ttl>1800</ttl>
$links);
        }
    } elsif ($footer) {
        $out .= ($format eq 'atom') ? '</feed>'
              : ($format eq 'rss')  ? '</channel></rss>'
              :                       '';
    }

    # we're done
    goto PRINT;
}
192
193
# Run date(1) in UTC on the date string $when with the extra output FLAGS
# and return what it printed with newlines removed ('' if it printed nothing
# or could not be started).  The list form of open() bypasses the shell, so
# quotes or other metacharacters in $when cannot break or alter the command.
sub _utc_date {
    my ($when, @flags) = @_;
    open(my $date_fh, '-|', 'date', '-u', @flags, '-d', $when) or return '';
    my $stamp = do { local $/; <$date_fh> };
    close($date_fh);
    $stamp = '' unless defined $stamp;
    $stamp =~ tr/\n//d;
    return $stamp;
}

# Slurp the whole note from standard input.
my $txt = do { local $/; <STDIN> };
$txt = '' unless defined $txt;

# The title is the (escaped, trimmed) first line of the note, with the
# author's name appended unless the line already contains it.  index() is
# used rather than a regex match: an interpolated empty pattern would make
# Perl reuse the last successful regex instead.
my ($first_line) = $txt =~ /\A([^\n]+)/;
my $title = html_esc(defined $first_line ? $first_line : '');
$title =~ s/^\s+|\s+$//g;
$title .= " &#8212; $author" if index($title, $author) < 0;

# The last three lines of the note are read as the "updated", "published"
# and "plain text" fields (English or Persian labels).  A line that does not
# carry the expected label leaves its variable undefined.
my ($upd, $pub, $url) = $txt =~ /(.*)\r?\n(.*)\r?\n(.*)\r?\n?\z/;
($upd) = $upd =~ /(?:updated|ویرایش): (.*)/ if $upd;
($pub) = $pub =~ /(?:published|انتشار): (.*)/ if $pub;
($url) = $url =~ /(?:plain text|متن ساده): (.*)/ if $url;
# fallback plain-text URL when the title mentions "curriculum vitae"
$url = 'https://bndl.org/bandali-cv.txt'
    if (!$url and $title =~ /curriculum vitae/);
$url = html_esc($url) if $url;

$txt = linkify(html_esc($txt));

# Dates reformatted for the feeds.  Declared unconditionally: the behaviour
# of "my $x = ... if COND" is undefined in Perl.
my $upd_iso8601 = $upd ? _utc_date($upd, '-Iseconds') : undef;
my $pub_iso8601 = $pub ? _utc_date($pub, '-Iseconds') : undef;
my $pub_rfc5322 = $pub ? _utc_date($pub, '-R') : undef;

# HTML URL, slug and feed entry id derived from the plain-text URL:
#   .../bandali-foo.txt or .../bandali-foo.$lang.txt -> .../foo.html -> "foo"
my ($url_html, $slug, $note_id);
if ($url) {
    $url_html = $url =~ s/(?:[.]\Q$lang\E)?[.]txt$/.html/r;
    $url_html =~ s|/bandali-(.*)|/$1|;
}
if ($url_html) {
    $slug    = $url_html =~ s|.*/(.*)[.]html$|$1|r;
    $note_id = "$feed_id:$slug";
}
32469801
AB
218
# note header
if ($format eq 'html') {
    $out .=
        '<!doctype html>'
        # The '>' closing the <html> tag is appended outside the ternary so
        # that it is emitted for every language, including 'fa' with its
        # extra dir attribute.
        . qq(<html lang="$lang") . ($lang eq 'fa' ? ' dir="rtl"' : '') . '>'
        . qq(<head>
<meta http-equiv="Content-Type"
content="text/html; charset=utf-8" />\n)
        . "<title>$title</title>\n"
        . qq(<link rel="icon" href="data:,">\n)
        # link back to the plain-text source, when one is known
        . ($url
           ? qq(<link rel="alternate" href="$url"
title="plain text" type="text/plain" />\n)
           : '')
        # index pages additionally point at the other language's index
        . (($index and $lang eq 'en')
           ? qq(<link rel="alternate" href="https://bndl.org/fa/"
hreflang="fa" title="persian" />\n)
           : ($index and $lang eq 'fa')
           ? qq(<link rel="alternate" href="https://bndl.org/"
hreflang="en" title="english" />\n)
           : '')
        . qq(<style>\@media(prefers-color-scheme:dark){
body{background:#1c1c1c;color:white;}a:link{color:#acdeff;}
a:visited{color:#f8f;}a:active{color:#e00;}})
        . ($lang eq 'fa'
           ? qq(\n\@font-face{font-family:sahel;font-weight:normal;
src:local('Sahel WOL'),local('Sahel'),
url('sahel.woff2')format('woff2');}pre{font-family:sahel})
           : '')
        . "</style>\n"
        . '</head><body><pre>';
} elsif ($format eq 'atom') {
    # optional <updated> element; always declared ("my ... if COND" has
    # undefined behaviour) and empty when the note has no update date
    my $updated = $upd ? "<updated>$upd_iso8601</updated>\n" : '';
    $out .= qq(
<entry xml:base="$site_url">
<author><name>$author</name></author>
<id>$note_id</id>
<published>$pub_iso8601</published>\n)
        . $updated
        . qq(<link href="$url" rel="alternate" type="text/plain" />
<link href="$url_html" rel="alternate" type="text/html" />
<title>$title</title>
<content type="html"><![CDATA[<pre>);
} elsif ($format eq 'rss') {
    # same optional element, under the atom: prefix for RSS
    my $updated = $upd ? "<atom:updated>$upd_iso8601</atom:updated>\n" : '';
    $out .= qq(
<item>
<title>$title</title>
<link>$url_html</link>
<guid isPermaLink="false">$note_id</guid>
<pubDate>$pub_rfc5322</pubDate>\n)
        . $updated
        . qq(<content:encoded><![CDATA[<pre>);
}
# note body
$out .= $txt;
# note footer
if ($format eq 'html') {
    $out .= '</pre></body></html>';
} elsif ($format eq 'atom') {
    $out .= "</pre>]]></content></entry>";
} elsif ($format eq 'rss') {
    $out .= "</pre>]]></content:encoded></item>";
}

# Also the jump target of the feed header/footer mode, which skips
# everything between its goto and this label.
 PRINT:
print("$out\n");
STDOUT->flush;