small tweaks and cleanups
[~bandali/bndl.org] / txt2html
CommitLineData
d3adcff4
AB
1#!/usr/bin/env perl
2# txt2html --- simple script to convert my site's txt files to html
3
4# Copyright (C) 2014-2021 all contributors <meta@public-inbox.org>
5# Copyright (c) 2021 Amin Bandali <bandali@gnu.org>
6#
7# This program is free software: you can redistribute it and/or modify
8# it under the terms of the GNU Affero General Public License as
9# published by the Free Software Foundation, either version 3 of the
10# License, or (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU Affero General Public License for more details.
16#
17# You should have received a copy of the GNU Affero General Public License
18# along with this program. If not, see <https://www.gnu.org/licenses/>.
19
20# This simple script borrows from the wonderful `txt2pre' from
21# public-inbox.git, under AGPLv3+, with a few additions of my own.
22
23
24use strict;
25use warnings 'all';
e02deb23
AB
26use Getopt::Long;
27
# Command-line options:
#   --lang=CODE  language code of the page; defaults to 'en'
#   --index      flag; set when the page being converted is an index page
my ($opt_lang, $opt_index) = ('en');
die("bad command line arguments\n")
    unless GetOptions('index'  => \$opt_index,
                      'lang=s' => \$opt_lang);
d3adcff4
AB
33
# Pattern that recognizes a URL inside running text.
#   $1 - optional opening delimiter directly before the URL: ( ' or !
#   $2 - the URL itself: scheme, host part, optional path/query/fragment
#
# This script applies the pattern to text that has already been
# HTML-escaped, so a literal '<' or '>' next to a URL shows up as '&lt;'
# or '&gt;'.  Both '&' and ';' are legal URL characters here (an escaped
# '&amp;' must stay inside the URL), so each character of the path, query
# and fragment is guarded by a negative lookahead that ends the URL right
# before an escaped angle bracket.  Without it, "<https://host/path/>"
# would swallow the trailing "&gt;" into the link.
my $link_re =
    qr{([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher)://
       [\@:\w\.-]+(?:/
       (?:(?:(?!&[lg]t;)[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%])*)
       (?:\?(?:(?!&[lg]t;)[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%])+)?
       (?:\#(?:(?!&[lg]t;)[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?])+)?
       )?
      )}xi;
42
# Maps an opening delimiter found directly before a URL to a pattern that
# matches its closing counterpart at the very end of the URL, optionally
# followed by one of . , ; or +.  The matched closer (plus punctuation) is
# captured in $1.
my %pairs;
@pairs{ '(', "'", '!' } = (
    qr/(\)[\.,;\+]?)\z/,    # Markdown (,), Ruby (+) (, for arrays)
    qr/('[\.,;\+]?)\z/,     # Perl / Ruby
    qr/(![\.,;\+]?)\z/,     # Perl / Ruby
);
48
# Characters that html_esc() replaces, with their HTML entities.  Quote
# characters are deliberately left alone (entries kept commented out).
my %html_map = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    # '"' => '&quot;',
    # "'" => '&#39;',
);

# html_esc($s): return a copy of $s with '&', '<' and '>' replaced by
# their HTML entities.  An undefined argument yields the empty string
# instead of triggering "uninitialized value" warnings and returning
# undef, which the title extraction in this script can otherwise cause
# when the input has no usable first line.
sub html_esc {
    my ($s) = @_;
    return '' unless defined $s;
    $s =~ s/([&<>])/$html_map{$1}/g;
    return $s;
}
62
# linkify($s): return $s with every URL matched by $link_re wrapped in an
# <a href="..."> element.  Punctuation that trails a URL and most likely
# belongs to the surrounding sentence is kept outside of the link.
sub linkify {
    my ($s) = @_;
    $s =~ s^$link_re^
        my ($lead, $href) = ($1 || '', $2);
        my $trail  = '';
        my $closer = $pairs{$lead};

        # URLs in prose are commonly followed by '.', ',' or ';' that
        # terminate the sentence rather than the URL; move such
        # characters out of the link.
        if (defined $closer) {
            # an opening delimiter preceded the URL: strip its partner
            $trail = $1 if $href =~ s/$closer//;
        } elsif ($href =~ s/(\))?([\.,;])\z//) {
            my ($paren, $punct) = ($1, $2);
            $trail = $punct;
            if (defined $paren) {
                # a ')' only stays in the URL when the URL has a '('
                if (index($href, '(') >= 0) {
                    $href .= ')';
                } else {
                    $trail = ")$punct";
                }
            }
        } elsif ($href !~ /\(/ && $href =~ s/\)\z//) {
            $trail = ')';
        }

        $lead . qq(<a href="$href">$href</a>) . $trail;
    ^geo;
    $s;
}
96
97
# Slurp the whole document from standard input.
my $txt = do { local $/; <STDIN> };
$txt = '' unless defined $txt;    # nothing could be read at all

# The page title is the first line of the text, HTML-escaped and trimmed.
# The match is done separately so that an empty input or a blank first
# line gives an empty title instead of passing nothing to html_esc().
my ($first_line) = $txt =~ /\A([^\n]+)/;
my $title = html_esc(defined $first_line ? $first_line : '');
$title =~ s/^\s+|\s+$//g;

# Append the site name unless the title already mentions it.
if ($opt_lang eq 'fa') {
    $title .= ' &mdash; بندعلی' if $title !~ /بندعلی/;
} else {
    $title .= ' &mdash; bandali' if $title !~ /bandali/;
}

# The last line of the text may name the plain-text version of the page
# ("plain text: URL").  The capture is non-greedy so that the CR of a
# CRLF line ending is consumed by \r? and does not end up in the URL.
my ($url) = $txt =~ /(?:plain text|متن ساده): (.*?)\r?\n?\z/;
$url = 'https://bndl.org/bandali-cv.txt'
    if (!$url and $title =~ /curriculum vitae/);
$url = html_esc($url) if $url;

# Convert the body: escape HTML first, then turn URLs into links.
$txt = linkify(html_esc($txt));
112
# Assemble the HTML page piece by piece, then write it to standard output.
my $is_fa = $opt_lang eq 'fa';

# Document preamble; Persian pages get dir="rtl" on the root element.
my @page = (
    '<!doctype html>',
    qq(<html lang="$opt_lang"),
    ($is_fa ? ' dir="rtl"' : ''),
    '>',
    "<head>\n",
    qq(<meta http-equiv="Content-Type"\n),
    qq(content="text/html; charset=utf-8" />\n),
    "<title>$title</title>\n",
);

# Link to the plain-text version of the page when one is known.
if ($url) {
    push @page,
        qq(<link rel="alternate" href="$url"\n),
        qq(title="plain text" type="text/plain" />\n);
}

# Index pages link to their counterpart in the other language.
if ($opt_index) {
    if ($opt_lang eq 'en') {
        push @page,
            qq(<link rel="alternate" href="https://bndl.org/fa/"\n),
            qq(hreflang="fa" title="persian" />\n);
    } elsif ($is_fa) {
        push @page,
            qq(<link rel="alternate" href="https://bndl.org/"\n),
            qq(hreflang="en" title="english" />\n);
    }
}

# Persian pages use the Sahel font for the preformatted text.
if ($is_fa) {
    push @page,
        qq(<style>\@font-face{font-family:sahel;font-weight:normal;\n),
        qq(src:local('Sahel WOL'),local('Sahel'),\n),
        qq(url('sahel.woff2')format('woff2');}pre{font-family:sahel}</style>\n);
}

push @page, "</head><body><pre>$txt</pre></body></html>\n";

print(@page);
STDOUT->flush;