txt2html: use Getopt::Long for saner command line arg handling
[~bandali/bndl.org] / txt2html
CommitLineData
d3adcff4
AB
1#!/usr/bin/env perl
2# txt2html --- simple script to convert my site's txt files to html
3
4# Copyright (C) 2014-2021 all contributors <meta@public-inbox.org>
5# Copyright (c) 2021 Amin Bandali <bandali@gnu.org>
6#
7# This program is free software: you can redistribute it and/or modify
8# it under the terms of the GNU Affero General Public License as
9# published by the Free Software Foundation, either version 3 of the
10# License, or (at your option) any later version.
11#
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU Affero General Public License for more details.
16#
17# You should have received a copy of the GNU Affero General Public License
18# along with this program. If not, see <https://www.gnu.org/licenses/>.
19
20# This simple script borrows from the wonderful `txt2pre' from
21# public-inbox.git, under AGPLv3+, with a few additions of my own.
22
23
24use strict;
25use warnings 'all';
e02deb23
AB
26use Getopt::Long;
27
# Language code of the generated page; set with --lang=CODE, 'en' by default.
my $opt_lang = 'en';
unless (GetOptions('lang=s' => \$opt_lang)) {
    die("bad command line arguments\n");
}
d3adcff4
AB
31
# Matches a URL that uses one of the schemes listed below.
#   $1 - optional opening delimiter found directly before the URL:
#        "(", "'" or "!"
#   $2 - the URL itself: scheme, "://", authority, then an optional
#        path, query string and fragment
my $link_re =
    qr{([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher)://
        [\@:\w\.-]+(?:/
            (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*)
            (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)?
            (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)?
        )?
    )}xi;

# Keyed by the opening delimiter that $link_re may capture in $1.  Each
# pattern matches the corresponding closing delimiter, optionally followed
# by one punctuation character, at the very end of a string and captures
# that text in $1.
my %pairs = (
    '('  => qr/(\)[\.,;\+]?)\z/,    # Markdown (,), Ruby (+) (, for arrays)
    q(') => qr/('[\.,;\+]?)\z/,     # Perl / Ruby
    '!'  => qr/(![\.,;\+]?)\z/,     # Perl / Ruby
);
46
# Entity for each character that html_esc() rewrites.  Quote characters
# are not escaped; their entries are kept here commented out.
my %html_map = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    # '"' => '&quot;',
    # "'" => '&#39;',
);

# html_esc($s) -- return a copy of $s in which every '&', '<' and '>' has
# been replaced by the entity listed in %html_map.
sub html_esc {
    my ($text) = @_;
    $text =~ s{([&<>])}{$html_map{$1}}g;
    return $text;
}
60
# _link_html($beg, $url) -- build the replacement text for one $link_re match.
#
# $beg is the opening delimiter captured in front of the URL ('' when there
# was none) and $url is the raw URL match.  Returns $beg, the URL wrapped in
# an <a> element, and whatever trailing text was split off the match because
# it is not treated as part of the URL.
sub _link_html {
    my ($beg, $url) = @_;
    my $end  = '';    # closing delimiter / punctuation moved out of the link
    my $tail = '';    # everything from an escaped '<' or '>' onwards

    # linkify() is applied to HTML-escaped text, so a '<' or '>' that
    # followed a URL path (as in "<https://example.org/x>") arrives here
    # as "&lt;" or "&gt;", which consist only of characters that $link_re
    # accepts inside a path.  Neither character may appear raw in a URL,
    # so end the URL there and linkify the remainder on its own, since it
    # may contain further URLs.
    if ($url =~ s/(&[lg]t;.*)\z//s) {
        my $cut = $1;
        $tail = linkify($cut);
    }

    # it's fairly common to end URLs in messages with
    # '.', ',' or ';' to denote the end of a statement;
    # assume the intent was to end the statement/sentence
    # in English
    if (defined(my $re = $pairs{$beg})) {
        # the URL was opened by a delimiter: strip its closing partner
        if ($url =~ s/$re//) {
            $end = $1;
        }
    } elsif ($url =~ s/(\))?([\.,;])\z//) {
        $end = $2;
        # require ')' to be paired with '('
        if (defined $1) { # ')'
            if (index($url, '(') < 0) {
                $end = ")$end";
            } else {
                $url .= ')';
            }
        }
    } elsif ($url !~ /\(/ && $url =~ s/\)\z//) {
        $end = ')';
    }

    return $beg . qq(<a href="$url">$url</a>) . $end . $tail;
}

# linkify($s) -- return a copy of $s in which every URL matched by $link_re
# is wrapped in an <a href="..."> element.
#
# $s is expected to be HTML-escaped already: the URL is copied into the
# href attribute and the link text without any further escaping.
sub linkify {
    my ($s) = @_;
    $s =~ s{$link_re}{_link_html($1 || '', $2)}ge;
    $s;
}
94
95
# Read the whole input document from standard input in one go.
my $txt = do { local $/; <STDIN> };
$txt = '' unless defined $txt;

# The page title is the first line of the input, HTML-escaped and trimmed.
# The match is captured in list context first: on empty input, or input
# that starts with a newline, it yields an empty list, and handing that
# straight to html_esc() would trigger "uninitialized value" warnings.
my ($first_line) = $txt =~ /\A([^\n]+)/;
my $title = html_esc(defined $first_line ? $first_line : '');
$title =~ s/^\s+|\s+$//g;

# Append the site name unless the title already contains it.  When there
# is no title at all, the site name stands alone instead of following a
# dangling dash.
my $is_fa     = $opt_lang eq 'fa';
my $site_name = $is_fa ? 'بندعلی' : 'bandali';
if ($title eq '') {
    $title = $site_name;
} elsif (index($title, $site_name) < 0) {
    $title .= " &mdash; $site_name";
}

$txt = linkify(html_esc($txt));

# The language code comes from the command line and html_esc() does not
# touch quotes, so escape '"' as well before the value is placed inside a
# double-quoted attribute.
(my $lang_attr = html_esc($opt_lang)) =~ s/"/&quot;/g;

# Emit the page: the converted text inside a single <pre>, right-to-left
# with an embedded web font rule when the language is 'fa'.
print('<!doctype html>',
    qq(<html lang="$lang_attr"),
    $is_fa ? ' dir="rtl"' : '', '>',
    '<head><meta
http-equiv="Content-Type"
content="text/html; charset=utf-8"
/>',
    "<title>$title</title>",
    $is_fa
    ? "\n<style>\@font-face{font-family:sahel;font-weight:normal;
src:local('Sahel WOL'),local('Sahel'),
url('sahel.woff2')format('woff2');}pre{font-family:sahel}</style>\n"
    : '',
    '</head><body><pre>', $txt, '</pre></body></html>')
    or die("txt2html: cannot write to standard output: $!\n");

# Flush explicitly so that a failed write is reported, not silently lost.
STDOUT->flush
    or die("txt2html: cannot flush standard output: $!\n");