Commit | Line | Data |
---|---|---|
d3adcff4 | 1 | #!/usr/bin/env perl |
22e69f33 | 2 | # txt2pre --- convert my site's txt files to `pre'-based atom/rss/html |
d3adcff4 AB |
3 | |
4 | # Copyright (C) 2014-2021 all contributors <meta@public-inbox.org> | |
5 | # Copyright (c) 2021 Amin Bandali <bandali@gnu.org> | |
6 | # | |
7 | # This program is free software: you can redistribute it and/or modify | |
8 | # it under the terms of the GNU Affero General Public License as | |
9 | # published by the Free Software Foundation, either version 3 of the | |
10 | # License, or (at your option) any later version. | |
11 | # | |
12 | # This program is distributed in the hope that it will be useful, | |
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | # GNU Affero General Public License for more details. | |
16 | # | |
17 | # You should have received a copy of the GNU Affero General Public License | |
18 | # along with this program. If not, see <https://www.gnu.org/licenses/>. | |
19 | ||
4ce760d0 AB |
20 | # This simple script borrows from a script of the same name from the |
21 | # wonderful public-inbox project, under AGPLv3+, with additions of | |
22 | # my own. | |
d3adcff4 AB |
23 | |
24 | ||
25 | use strict; | |
26 | use warnings 'all'; | |
e02deb23 AB |
27 | use Getopt::Long; |
28 | ||
32469801 AB |
# Command-line options and their defaults.
#   --format=FMT  output format; the script handles 'html', 'atom' and 'rss'
#   --lang=LANG   content language; 'en' and 'fa' are recognised
#   --index       flag: with atom/rss, emit only the feed header or footer;
#                 with html, add a link to the other language's index
#   --header      flag: (with --index) emit the opening part of the feed
#   --footer      flag: (with --index) emit the closing part of the feed
my ($format, $lang) = ('html', 'en');
my ($index, $header, $footer) = ('', '', '');

my %option_spec = (
    'format=s' => \$format,
    'lang=s'   => \$lang,
    'index'    => \$index,
    'header'   => \$header,
    'footer'   => \$footer,
);

GetOptions(%option_spec)
    or die "bad command line arguments\n";
d3adcff4 | 41 | |
32469801 AB |
# Per-language site metadata.  Every value stays the empty string when
# $lang is neither 'en' nor 'fa'.
my ($author, $site_title, $site_desc, $site_url, $feed_id) = ('') x 5;

if ($lang eq 'en') {
    $author     = 'bandali';
    $site_title = "${author}'s personal site";
    $site_desc  = "notes and blog posts by $author";
    $site_url   = 'https://bndl.org';
    $feed_id    = "tag:bndl.org,2020:notes.$format";
} elsif ($lang eq 'fa') {
    $author     = 'بندعلی';
    $site_title = "سایت شخصی $author";
    $site_desc  = "نوشتهها و بلاگ پستهای $author";
    $site_url   = 'https://bndl.org/fa/';
    $feed_id    = "tag:bndl.org,2020:fa/notes.$format";
}
62 | ||
d3adcff4 AB |
# Characters accepted in the path, query and fragment parts of a URL
# (interpolated into bracketed character classes below).
my $uri_chars = q{a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%};

# Matches a URL in running text.  The first capture group is an optional
# opening delimiter directly before the URL; the second is the URL itself.
my $link_re = qr{
    ([\('!])?
    \b(
        (?:ftps?|https?|nntps?|imaps?|s?news|gopher)://
        [\@:\w\.-]+
        (?:/
            (?:[$uri_chars]*)
            (?:\?[$uri_chars]+)?
            (?:\#[$uri_chars\?]+)?
        )?
    )
}xi;

# For each opening delimiter: a pattern matching the corresponding closing
# delimiter (plus optional trailing punctuation) at the end of a URL.
my %pairs = (
    # Markdown (,), Ruby (+) (, for arrays)
    "(" => qr/(\)[\.,;\+]?)\z/,
    # Perl / Ruby
    "'" => qr/('[\.,;\+]?)\z/,
    # Perl / Ruby
    "!" => qr/(![\.,;\+]?)\z/,
);
77 | ||
# Characters that must be replaced by entities in HTML/XML text content.
# Quotes are deliberately left alone (entries kept here for reference).
my %html_map = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    # '"' => '&quot;',
    # "'" => '&#39;',
);

# html_esc($s): return a copy of $s with '&', '<' and '>' replaced by
# their HTML entities.  An undefined argument yields the empty string.
sub html_esc {
    my ($s) = @_;
    return '' unless defined $s;
    $s =~ s/([&<>])/$html_map{$1}/g;
    return $s;
}
91 | ||
# _linkify_match($beg, $url): build the replacement text for one URL match.
#   $beg  the optional opening delimiter captured before the URL (may be undef)
#   $url  the matched URL, possibly with trailing punctuation attached
# Returns $beg, the URL wrapped in an <a> element, and whatever trailing
# text was split off the end of the URL.
sub _linkify_match {
    my ($beg, $url) = @_;
    $beg ||= '';
    my $end  = '';
    my $tail = '';

    # linkify runs on HTML-escaped text and the URL character class accepts
    # '&', ';' and letters, so an escaped angle bracket right after a URL
    # (as in "<https://host/path>") would be swallowed into it.  A raw '<'
    # or '>' cannot be part of a URL: cut the URL there.
    if ($url =~ s/(&(?:lt|gt);.*)\z//s) {
        $tail = $1;
    }

    # it's fairly common to end URLs in messages with
    # '.', ',' or ';' to denote the end of a statement;
    # assume the intent was to end the statement/sentence
    # in English
    if (defined(my $re = $pairs{$beg})) {
        if ($url =~ s/$re//) {
            $end = $1;
        }
    } elsif ($url =~ s/(\))?([\.,;])\z//) {
        my ($paren, $punct) = ($1, $2);
        $end = $punct;
        # require ')' to be paired with '('
        if (defined $paren) {
            if (index($url, '(') < 0) {
                $end = ")$end";
            } else {
                $url .= ')';
            }
        }
    } elsif ($url !~ /\(/ && $url =~ s/\)\z//) {
        $end = ')';
    }

    return $beg . "<a href=\"$url\">$url</a>" . $end . $tail;
}

# linkify($s): return a copy of $s in which every URL matched by $link_re
# is wrapped in an <a href="..."> element.  $s is expected to be
# HTML-escaped already, since the URL text is emitted as-is.
sub linkify {
    my ($s) = @_;
    $s =~ s{$link_re}{_linkify_match($1, $2)}geo;
    return $s;
}
125 | ||
126 | ||
32469801 AB |
# Accumulates everything the script prints.
my $out = '';

# atom/rss feed header and footer
#
# With --index and an atom/rss format no note is read from STDIN: only the
# opening (--header) or closing (--footer) part of the feed is produced,
# then control jumps straight to the PRINT label.
if ($index and ($format eq 'atom' or $format eq 'rss')) {
    if ($header) {
        # current time in the two formats the feeds need
        chomp(my $now_iso8601 = `date -Iseconds -u`);
        chomp(my $now_rfc5322 = `date -uR`);
        my $atom_rel = $format eq 'atom' ? 'self' : 'alternate';
        my $rss_rel  = $format eq 'rss'  ? 'self' : 'alternate';
        # inside RSS the link elements must carry the atom: prefix
        my $link  = $format eq 'atom' ? 'link' : 'atom:link';
        my $links = '';
        if ($lang eq 'en') {
            $links = qq(
<$link hreflang="fa" href="https://bndl.org/fa/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="fa" href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link hreflang="fa" href="https://bndl.org/fa/" rel="alternate" type="text/html" />
<$link href="https://bndl.org/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org" rel="alternate" type="text/html" />);
        } elsif ($lang eq 'fa') {
            # same element name as the 'en' branch, so that the RSS
            # output gets atom:link here too
            $links = qq(
<$link hreflang="en" href="https://bndl.org/notes.atom" rel="alternate" type="application/atom+xml" />
<$link hreflang="en" href="https://bndl.org/notes.rss" rel="alternate" type="application/rss+xml" />
<$link hreflang="en" href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" />
<$link hreflang="en" href="https://bndl.org" rel="alternate" type="text/html" />
<$link href="https://bndl.org/fa/notes.atom" rel="$atom_rel" type="application/atom+xml" />
<$link href="https://bndl.org/fa/notes.rss" rel="$rss_rel" type="application/rss+xml" />
<$link href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" />
<$link href="https://bndl.org/fa/" rel="alternate" type="text/html" />);
        }
        # drop the newline that follows the opening qq( delimiter
        $links =~ s/^\n//;

        $out .= '<?xml version="1.0" encoding="UTF-8" ?>';
        if ($format eq 'atom') {
            $out .= qq(
<feed xml:lang="$lang" xmlns="http://www.w3.org/2005/Atom">
<title>$site_title</title>
<subtitle>$site_desc</subtitle>
<id>$feed_id</id>
$links
<updated>$now_iso8601</updated>);
        } else {
            # $format is 'rss' here (guaranteed by the outer condition)
            $out .= qq(
<rss version="2.0"
xmlns:atom="http://www.w3.org/2005/Atom"
xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title>$site_title</title>
<description>$site_desc</description>
<link>$site_url</link>
<language>$lang</language>
<lastBuildDate>$now_rfc5322</lastBuildDate>
<pubDate>$now_rfc5322</pubDate>
<ttl>1800</ttl>
$links);
        }
    } elsif ($footer) {
        $out .= $format eq 'atom' ? '</feed>' : '</channel></rss>';
    }

    # we're done
    goto PRINT;
}
192 | ||
193 | ||
# _date_output(@args): run date(1) with @args and return its output with
# newlines removed, or the empty string when the command cannot be run or
# prints nothing.  The arguments are passed as a list, so no shell is
# involved and quote characters in a date string cannot alter the command.
sub _date_output {
    my @args = @_;
    open(my $pipe, '-|', 'date', @args) or return '';
    my $line = <$pipe>;
    close $pipe;
    return '' unless defined $line;
    $line =~ tr/\n//d;
    return $line;
}

# Slurp the whole note from standard input (empty input gives '').
my $txt = do { local $/; <STDIN> } // '';

# The title is the first line of the note, with the author's name appended
# unless the line already contains it.
my ($first_line) = $txt =~ /\A([^\n]+)/;
my $title = html_esc($first_line // '');
$title =~ s/^\s+|\s+$//g;
# literal substring test: an empty $author must not turn into the empty
# regex, which would silently reuse the last successful pattern
$title .= " — $author" if index($title, $author) < 0;

# The last three lines of the note may carry "updated", "published" and
# "plain text" fields; each variable ends up undefined when its line does
# not have the expected label.
my ($upd, $pub, $url) = $txt =~ /(.*)\r?\n(.*)\r?\n(.*)\r?\n?\z/;
($upd) = $upd =~ /(?:updated|ویرایش): (.*)/ if $upd;
($pub) = $pub =~ /(?:published|انتشار): (.*)/ if $pub;
($url) = $url =~ /(?:plain text|متن ساده): (.*)/ if $url;
$url = 'https://bndl.org/bandali-cv.txt'
    if (!$url and $title =~ /curriculum vitae/);
$url = html_esc($url) if $url;

$txt = linkify(html_esc($txt));

# Dates reformatted for the feeds.  Declared unconditionally: the effect
# of "my $x = ... if COND" is undefined in Perl.
my ($upd_iso8601, $pub_iso8601, $pub_rfc5322);
$upd_iso8601 = _date_output('-Iseconds', '-ud', $upd) if $upd;
$pub_iso8601 = _date_output('-Iseconds', '-ud', $pub) if $pub;
$pub_rfc5322 = _date_output('-uRd', $pub) if $pub;

# HTML URL, slug and entry id derived from the plain-text URL; all three
# stay undefined when there is no plain-text URL.
my ($url_html, $slug, $note_id);
if ($url) {
    $url_html = $url =~ s/(?:[.]$lang)?[.]txt$/.html/r;
}
if ($url_html) {
    $url_html =~ s|/bandali-(.*)|/$1|;
}
if ($url_html) {
    $slug    = $url_html =~ s|.*/(.*)[.]html$|$1|r;
    $note_id = "$feed_id:$slug";
}
32469801 AB |
218 | |
# note header
if ($format eq 'html') {
    # Persian pages are right-to-left.  The attribute is computed first so
    # that the closing '>' of the <html> tag is emitted for every language.
    my $dir_attr = $lang eq 'fa' ? ' dir="rtl"' : '';

    $out .=
        '<!doctype html>'
        . qq(<html lang="$lang"$dir_attr>)
        . qq(<head>
<meta http-equiv="Content-Type"
content="text/html; charset=utf-8" />\n)
        . "<title>$title</title>\n"
        . qq(<link rel="icon" href="data:,">\n)
        # link to the plain-text original, when known
        . ($url
            ? qq(<link rel="alternate" href="$url"
title="plain text" type="text/plain" />\n)
            : '')
        # on index pages, link to the index in the other language
        . (($index and $lang eq 'en')
            ? qq(<link rel="alternate" href="https://bndl.org/fa/"
hreflang="fa" title="persian" />\n)
            : ($index and $lang eq 'fa')
            ? qq(<link rel="alternate" href="https://bndl.org/"
hreflang="en" title="english" />\n)
            : '')
        . qq(<style>\@media(prefers-color-scheme:dark){
body{background:#1c1c1c;color:white;}a:link{color:#acdeff;}
a:visited{color:#f8f;}a:active{color:#e00;}})
        . ($lang eq 'fa'
            ? qq(\n\@font-face{font-family:sahel;font-weight:normal;
src:local('Sahel WOL'),local('Sahel'),
url('sahel.woff2')format('woff2');}pre{font-family:sahel})
            : '')
        . "</style>\n"
        . '</head><body><pre>';
} elsif ($format eq 'atom' or $format eq 'rss') {
    # the "updated" element needs the atom: prefix inside an RSS item
    my $atom_updated = $format eq 'atom' ? 'updated' : 'atom:updated';
    # empty when the note has no "updated" field
    my $updated = $upd
        ? "<$atom_updated>$upd_iso8601</$atom_updated>\n"
        : '';

    if ($format eq 'atom') {
        $out .= qq(
<entry xml:base="$site_url">
<author><name>$author</name></author>
<id>$note_id</id>
<published>$pub_iso8601</published>\n)
            . $updated
            . qq(<link href="$url" rel="alternate" type="text/plain" />
<link href="$url_html" rel="alternate" type="text/html" />
<title>$title</title>
<content type="html"><![CDATA[<pre>);
    } else {
        # rss
        $out .= qq(
<item>
<title>$title</title>
<link>$url_html</link>
<guid isPermaLink="false">$note_id</guid>
<pubDate>$pub_rfc5322</pubDate>\n)
            . $updated
            . qq(<content:encoded><![CDATA[<pre>);
    }
}
# note body
$out .= $txt;
# note footer
if ($format eq 'html') {
    $out .= '</pre></body></html>';
} elsif ($format eq 'atom') {
    $out .= "</pre>]]></content></entry>";
} elsif ($format eq 'rss') {
    $out .= "</pre>]]></content:encoded></item>";
}

# Also the jump target of the feed header/footer code path, which skips
# everything above and only prints what it put into $out.
PRINT:
print("$out\n");
STDOUT->flush;