#!/usr/bin/env perl # txt2pre --- convert my site's txt files to `pre'-based atom/rss/html # Copyright (C) 2014-2021 all contributors # Copyright (c) 2021 Amin Bandali # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as # published by the Free Software Foundation, either version 3 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # This simple script borrows from a script of the same name from the # wonderful public-inbox project, under AGPLv3+, with additions of # my own. use strict; use warnings 'all'; use Getopt::Long; my $format = 'html'; my $lang = 'en'; my $index = ''; my $header = ''; my $footer = ''; GetOptions ('format=s' => \$format, 'lang=s' => \$lang, 'index' => \$index, 'header' => \$header, 'footer' => \$footer) or die("bad command line arguments\n"); my $author = $lang eq 'en' ? 'bandali' : $lang eq 'fa' ? 'بندعلی' : ''; my $site_title = $lang eq 'en' ? "${author}'s personal site" : $lang eq 'fa' ? "سایت شخصی $author" : ''; my $site_desc = $lang eq 'en' ? "notes and blog posts by $author" : $lang eq 'fa' ? "نوشته‌ها و بلاگ پست‌های $author" : ''; my $site_url = ($lang eq 'en') ? 'https://bndl.org' : ($lang eq 'fa') ? 'https://bndl.org/fa/' : ''; my $feed_id = ($lang eq 'en') ? "tag:bndl.org,2020:notes.$format" : ($lang eq 'fa') ? "tag:bndl.org,2020:fa/notes.$format" : ''; my $link_re = qr{([\('!])?\b((?:ftps?|https?|nntps?|imaps?|s?news|gopher):// [\@:\w\.-]+(?:/ (?:[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]*) (?:\?[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%]+)? (?:\#[a-z0-9\-\._~!\$\&\';\(\)\*\+,;=:@/%\?]+)? 
# the path, query and fragment parts are optional as a group
)? )}xi; my %pairs = ( "(" => qr/(\)[\.,;\+]?)\z/, # Markdown (,), Ruby (+) (, for arrays) "'" => qr/('[\.,;\+]?)\z/, # Perl / Ruby "!" => qr/(![\.,;\+]?)\z/, # Perl / Ruby ); my %html_map = ( '&' => '&', '<' => '<', '>' => '>', # '"' => '"', # "'" => ''', ); sub html_esc { my ($s) = @_; $s =~ s/([&<>])/$html_map{$1}/sge; $s; } sub linkify { my ($s) = @_; $s =~ s^$link_re^ my $beg = $1 || ''; my $url = $2; my $end = ''; # it's fairly common to end URLs in messages with # '.', ',' or ';' to denote the end of a statement; # assume the intent was to end the statement/sentence # in English if (defined(my $re = $pairs{$beg})) { if ($url =~ s/$re//) { $end = $1; } } elsif ($url =~ s/(\))?([\.,;])\z//) { $end = $2; # require ')' to be paired with '(' if (defined $1) { # ')' if (index($url, '(') < 0) { $end = ")$end"; } else { $url .= ')'; } } } elsif ($url !~ /\(/ && $url =~ s/\)\z//) { $end = ')'; } $beg . "$url" . $end; ^geo; $s; } my $out = ''; # atom/rss feed header and footer if ($index and ($format eq 'atom' or $format eq 'rss')) { if ($header) { my $now_iso8601 = `date -Iseconds -u | tr -d \\\\n`; my $now_rfc5322 = `date -uR | tr -d \\\\n`; my $atom_rel = $format eq 'atom' ? 'self' : 'alternate'; my $rss_rel = $format eq 'rss' ? 'self' : 'alternate'; my $link = $format eq 'atom' ? 
# name of the feed link element: plain 'link' for atom, 'atom:link' otherwise
'link' : 'atom:link'; my $links = ''; if ($lang eq 'en') { $links = qq( <$link hreflang="fa" href="https://bndl.org/fa/notes.atom" rel="alternate" type="application/atom+xml" /> <$link hreflang="fa" href="https://bndl.org/fa/notes.rss" rel="alternate" type="application/rss+xml" /> <$link hreflang="fa" href="https://bndl.org/fa/bandali.fa.txt" rel="alternate" type="text/plain" /> <$link hreflang="fa" href="https://bndl.org/fa/" rel="alternate" type="text/html" /> <$link href="https://bndl.org/notes.atom" rel="$atom_rel" type="application/atom+xml" /> <$link href="https://bndl.org/notes.rss" rel="$rss_rel" type="application/rss+xml" /> <$link href="https://bndl.org/bandali.txt" rel="alternate" type="text/plain" /> <$link href="https://bndl.org" rel="alternate" type="text/html" />); } elsif ($lang eq 'fa') { $links = qq( ); } $links =~ s/^\n//; $out .= ''; $out .= ($format eq 'atom') ? qq( $site_title $site_desc $feed_id $links $now_iso8601) : ($format eq 'rss') ? qq( $site_title $site_desc $site_url $lang $now_rfc5322 $now_rfc5322 1800 $links) : ''; } elsif ($footer) { $out .= ($format eq 'atom') ? '' : ($format eq 'rss') ? 
# NOTE(review): every footer alternative here is an empty string as written; confirm no closing feed markup is missing
# NOTE(review): further down, the do-block assigned to $txt only localizes $/ and reads nothing as shown, and the updated/published values parsed from the input are interpolated into backtick shell commands -- confirm the input read and the quoting
'' : ''; } # we're done goto PRINT; } my $txt = do { local $/; }; my $title = html_esc($txt =~ /\A([^\n]+)/); $title =~ s/^\s+|\s+$//g; $title .= " — $author" if $title !~ /$author/; my ($upd, $pub, $url) = $txt =~ /(.*)\r?\n(.*)\r?\n(.*)\r?\n?\z/; ($upd) = $upd =~ /(?:updated|ویرایش): (.*)/ if $upd; ($pub) = $pub =~ /(?:published|انتشار): (.*)/ if $pub; $upd = $pub if (!$upd); ($url) = $url =~ /(?:plain text|متن ساده): (.*)/ if $url; $url = 'https://bndl.org/bandali-cv.txt' if (!$url and $title =~ /curriculum vitae/); $url = html_esc($url) if $url; $txt = linkify(html_esc($txt)); my $upd_iso8601 = `date -Iseconds -ud '$upd' | tr -d \\\\n` if $upd; my $pub_iso8601 = `date -Iseconds -ud '$pub' | tr -d \\\\n` if $pub; my $pub_rfc5322 = `date -uRd '$pub' | tr -d \\\\n` if $pub; my $url_html = $url =~ s/(?:[.]$lang)?[.]txt$/.html/r if $url; $url_html =~ s|/bandali-(.*)|/$1| if $url_html; my $slug = $url_html =~ s|.*/(.*)[.]html$|$1|r if $url_html; my $note_id = "$feed_id:$slug" if $url_html; # note header if ($format eq 'html') { $out .= '' . qq(') . qq( \n) . "$title\n" . qq(\n) . ($url ? qq(\n) : '') . (($index and $lang eq 'en') ? qq(\n) : ($index and $lang eq 'fa') ? qq(\n) : '') . qq(\n" . '
';
} elsif ($format eq 'atom' or $format eq 'rss') {
    # Feed note header: appends the per-note fields that precede the
    # note body to $out, for either an atom or an rss feed.

    # Name of the element that carries the last-modified timestamp:
    # 'updated' for atom, 'atom:updated' for rss.
    my $atom_updated =
        ($format eq 'atom') ? 'updated'
        : ($format eq 'rss') ? 'atom:updated'
        : '';

    # Declare unconditionally and pick the value with a ternary: a `my'
    # guarded by a statement modifier (`my $x = ... if COND') has
    # undefined behaviour according to perlsyn.  The element is also
    # closed now, so the emitted fragment is well-formed.
    my $updated = $upd
        ? "<$atom_updated>$upd_iso8601</$atom_updated>\n"
        : '';

    if ($format eq 'atom') {
        # atom always carries the update time when one is known
        $out .= qq(

$author
$note_id
$pub_iso8601\n)
            . $updated
            . qq(

$title
);
    }
    elsif ($format eq 'rss') {
        # rss only advertises an update time that differs from the
        # publication time; `// ''' avoids an "uninitialized" warning
        # when no publication date was found in the input.
        # NOTE(review): the trailing qq() is an empty literal, kept as
        # found -- confirm nothing is meant to be emitted there.
        $out .= qq(

$title
$url_html
$note_id
$pub_rfc5322\n)
            . (($updated and ($pub // '') ne $upd) ? $updated : '')
            . qq();
    }
}
# note body
$out .= $txt;

# note footer: one closing string per output format, nothing for
# any other format
$out .= $format eq 'html' ? "\n"
      : $format eq 'atom' ? "]]>"
      : $format eq 'rss'  ? "]]>"
      :                     '';

# target of the `goto PRINT' taken after a feed header/footer
PRINT:
print($out, "\n");
STDOUT->flush;