# API for reading and writing the Technical Report database files
#
# Markus Kuhn
package TechReports;
use 5.016;
use utf8;
use lib '../../scripts', '/anfs/www/VH-cl/scripts';
use CLWeb ();
use Digest::MD5 ();
use POSIX ();
use Time::Local ();
# Load the named tr-database.txt database file and return a hash table
# that contains both parameters and TechReport objects indexed by
# report number.
#
# That hash table also contains a few special entries aimed at the
# save routine, to help it preserve the order of included
# parameters as well as any comments or empty lines that precede
# a parameter or tech-report line.
sub load {
my ($class, $dbfile, $absfile, $timestampfile) = @_;
my $db = bless { } => $class;
my @missing;
my $nr = 0;
my $dbf;
open($dbf, "<:encoding(UTF-8)", $dbfile)
|| die("Can't open technical-report database '$dbfile':\n$!");
my @comments;
my @params;
while (<$dbf>) {
if (/^\s*\#/ || /^\s*$/) {
# record comments and empty lines
push @comments, $_;
} elsif (/^(\d+)\|([^|]*)\|([^|]*)\|([^|]*)\|(\d*)\|(.*)$/) {
# read and create report entry
if (exists $db->{$1}) {
die("$dbfile:$.:repeat occurrence of report number $1\n$_");
}
if ($1 < $nr) {
warn("$dbfile:$.:non-monotonic numbering from $nr to $1\n");
} elsif ($1 != $nr + 1) {
push @missing, ($nr+1)..($1-1);
}
$nr = $1;
my $tr = bless({ 'nr' => $nr, 'date' => $2, 'title' => $3,
'pages' => $5, 'db' => $db }
=> 'TechReport');
my @notes = split(/\|/, $6, -1);
if (@comments) {
$tr->{'_comment'} = [ @comments ];
@comments = ();
}
eval {
$tr->{'authors'} = [ map { TRAuthor::new($_) }
split(/, /,$4,-1) ];
};
# should contributors go into an extension of the author field
# rather than into a contrib= note?
die("$dbfile:$.:$@") if $@;
$tr->{'title'} =~ s/\\\\/\N{LINE SEPARATOR}/g;
die("$dbfile:$.:date field syntax error\n$_")
unless $tr->{date} =~ /^|\d{4}|\d{4}-\d{2}|\d{4}-\d{2}-\d{2}$/;
warn("$dbfile:$.:no author\n$_")
unless @{$tr->{'authors'}};
$db->{$nr} = $tr;
for my $note (@notes) {
if ($note =~ /^([a-z0-9-]+)=(.*)$/) {
my $key = $1;
my $value = $2;
die("$dbfile:$.:duplicate note '$key'\n")
if exists $tr->{$key};
$value =~ s/\\\\/\N{LINE SEPARATOR}/g;
$value =~ s/\\_/\N{PARAGRAPH SEPARATOR}/g;
$tr->{$key} = $value;
push @{$tr->{'_notes'}}, $key;
} elsif ($note =~ /^([a-z0-9-]+)$/) {
die("$dbfile:$.:duplicate note '$1'\n") if exists $tr->{$1};
$tr->{$1} = undef;
push @{$tr->{'_notes'}}, $1;
} else {
die("$dbfile:$.:syntax error in notes field '$note'\n$_");
}
}
} elsif (/^([a-z]+)=(.*)$/) {
# series parameters
my $key = $1;
my $value = $2;
push @params, $key;
$value =~ s/\\\\/\N{LINE SEPARATOR}/g;
$value =~ s/\\_/\N{PARAGRAPH SEPARATOR}/g;
$db->{$key} = $value;
if (@comments) {
$db->{"_$key comment"} = [ @comments ];
@comments = ();
}
} else {
die("$dbfile:$.:syntax error\n$_");
}
}
close($dbf);
$db->{'_final comment'} = [ @comments ] if @comments;
$db->{'_params'} = [ @params ] if @params;
warn("$dbfile:missing report numbers " . join(', ', @missing) . "\n")
if @missing && 0;
$db->load_abstracts($absfile) if defined $absfile;
$db->load_timestamps($timestampfile) if defined $timestampfile;
return $db;
}
# Read the abstracts into an already existing TechReports table
sub load_abstracts {
my ($db, $absfile) = @_;
# read in abstracts
my $abs;
open($abs, "<:encoding(UTF-8)", $absfile)
|| die("$absfile: cannot open abstracts database: $!\n");
# record comments and skip empty lines
my @comments;
while (<$abs>) {
last unless /^\s*\#/ || /^\s*$/;
push @comments, $_ unless /^\s*$/;
}
$db->{'_abstracts comment'} = [ @comments ] if @comments;
my $nr;
/^==([0-9]+)==+$/
or die("$absfile:$.:unexpected text before first separator line\n$_");
$nr = $1;
while (!eof($abs)) {
my $tr = $db->{int($nr)};
warn("$absfile:$.:found abstract for ".
"non-existing report number $nr\n") unless $tr;
die("$absfile:$.:found second abstract for number $nr\n")
if $tr && exists $tr->{'abstract'};
my @abstract;
while (<$abs>) {
if (/^==([0-9]+)==+$/) {
$nr = $1;
last;
};
push @abstract, $_;
}
my $abstract = join('', @abstract);
$abstract =~ s/\n+$/\n/s;
$abstract =~ s/(\S)\n(\S)/$1 $2/sg;
$tr->{'abstract'} = $abstract if $tr;
}
close($abs);
}
# list of technical reports
sub trs {
my ($db, $all) = @_;
my @trs = map { $db->{$_} } sort {$a <=> $b} grep { /^[0-9]+$/ } keys %$db;
return @trs if $all;
return grep {!exists $_->{deleted}} @trs;
}
# Convert a string indicating a range of report numbers (such as
# "100", "100-105", "-100", "100-") into a list of TechReport references.
# Several range-specifiers can be concatenated by comma.
sub range($) {
my ($db, $rlist) = @_;
my @list = ();
return $db->trs unless $rlist;
for $_ (split /,/, $rlist) {
my $first;
my $last;
if (/^(\d+)$/) {
$first = $last = $1;
} elsif (/^-(\d+)$/) {
$last = $1;
} elsif (/^(\d+)-$/) {
$first = $1;
} elsif (/^(\d+)-(\d+)$/) {
$first =$1;
$last = $2;
} else {
die("Unrecognized range specification '$rlist'!\n");
}
push @list, grep(!(defined $first && $_->{nr} < $first or
defined $last && $_->{nr} > $last),
$db->trs);
}
die("'$rlist' did not match any existing report number!\n")
unless @list;
return @list;
}
# Save a database of TR objects into a tr-database.txt file
sub save {
my ($db, $dbfile) = @_;
my @db;
my %params;
foreach my $p (grep { /^[a-z]+$/ } keys %{$db}) { $params{$p} = 1 };
# output loaded parameters in order in which there were loaded
# including any comments that preceded each parameter
foreach my $p (@{$db->{'_params'}}) {
push @db, @{$db->{"_$p comment"}};
next unless exists $params{$p};
my $value=$db->{$p};
$value =~ s/\N{LINE SEPARATOR}/\\\\/g;
$value =~ s/\N{PARAGRAPH SEPARATOR}/\\_/g;
push @db, "$p=$value\n";
delete $params{$p};
}
# then also output any newly added parameters
foreach my $p (sort keys %params) {
push @db, "$p=$db->{$p}\n";
}
foreach my $tr ($db->trs('all')) {
push @db, @{$tr->{'_comment'}} if $tr->{'_comment'};
push @db, $tr->db_line;
}
push @db, @{$db->{'_final comment'}} if exists $db->{'_final comment'};
CLWeb::write_file($dbfile, @db);
}
sub save_abstracts {
my ($db, $absfile) = @_;
my @abs;
push @abs, @{$db->{'_abstracts comment'}}
if exists $db->{'_abstracts comment'};
foreach my $tr ($db->trs('all')) {
next unless exists $tr->{'abstract'};
push @abs, "\n==$tr->{nr}================================================================\n";
push @abs, $tr->{'abstract'};
}
CLWeb::write_file($absfile, @abs);
}
sub load_timestamps {
my ($db, $timestampfile) = @_;
my $tf;
open($tf, "<", $timestampfile)
|| die("$timestampfile: cannot open timestamp database: $!\n");
my @modified;
my @deleted;
while (<$tf>) {
next if /^\s*\#/ || /^\s*$/; # skip comments and empty lines;
if (/^([0-9]+)\|(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})(\d{2})Z\|([0-9a-f]{32})$/) {
my $nr = $1;
my $lastmod = Time::Local::timegm($7, $6, $5, $4, $3-1, $2-1900);
my $lastdigest = $8;
die("$timestampfile:$.:odd timestamp\n$_")
unless $lastmod;
if (exists $db->{$nr}) {
my $tr = $db->{$nr};
if ($lastdigest ne $tr->digest) {
push @modified, $nr;
} else {
$tr->{'_lastmod'} = $lastmod;
}
} else {
push @{$db->{'_deleted'}}, $nr;
}
} else {
die("Syntax error in timestamp database line $.:\n$_");
}
}
close($tf);
# also find any newly added TechReports
my @added = map { $_->{nr} } grep { !exists $_->{'_lastmod'} } $db->trs;
push @{$db->{'_updated'}}, @modified, @added;
push @{$db->{'_deleted'}}, @deleted;
my $timestamp = time;
for my $nr (@modified, @added) {
$db->{$nr}{'_lastmod'} = $timestamp;
}
}
sub save_timestamps {
my ($db, $timestampfile) = @_;
my @ts = ("# TECHNICAL REPORT DATABASE MODIFICATION TIMESTAMPS\n".
"# AUTOMATICALLY GENERATED FILE -- DO NOT EDIT\n".
"# Format: nr|timestamp|md5-record-digest\n");
my $timestamp = time;
foreach my $tr ($db->trs('all')) {
$tr->{'_lastmod'} = $timestamp unless $tr->{'_lastmod'};
push @ts, ($tr->{nr}, '|', POSIX::strftime('%Y%m%dT%H%M%SZ',
gmtime($tr->{'_lastmod'})),
'|',$tr->digest,"\n");
}
CLWeb::write_file($timestampfile, @ts);
}
package TechReport;
# generate a single-line record of a TR in tr-database.txt format
sub db_line {
my ($tr) = @_;
my $title = $tr->{title};
$title =~ s/\N{LINE SEPARATOR}/\\\\/g;
my $line = "$tr->{nr}|$tr->{date}|$title|".
join(', ', map { $_->save } $tr->authors)."|$tr->{pages}|";
my @notes;
# append notes
foreach my $note (@{$tr->{'_notes'}}) {
my $value = $tr->{$note};
if (defined $value) {
$value =~ s/\N{LINE SEPARATOR}/\\\\/g;
$value =~ s/\N{PARAGRAPH SEPARATOR}/\\_/g;
push @notes, "$note=$value";
} else {
push @notes, $note;
}
}
return $line . join('|', @notes) . "\n";
}
sub digest {
my ($tr) = @_;
my $line = "$tr->{nr}.1|$tr->{date}|$tr->{title}|". # the '.1' was part no
join(', ', map { $_->save } $tr->authors)."|$tr->{pages}|";
my @notes;
# append notes
foreach my $note (@{$tr->{'_notes'}}) {
push @notes, defined $tr->{$note} ? "$note=$tr->{$note}" : $note;
}
$line .= join('|', @notes) . "\n";
my $preimage = Encode::encode_utf8($line . $tr->{'abstract'});
return Digest::MD5::md5_hex($preimage);
}
# full TR number
sub code {
my ($tr) = @_;
return "$tr->{db}{code}-$tr->{nr}";
}
# all available download files
sub files {
my ($tr) = @_;
my @files;
for my $suffix ( 'pdf', 'ps.gz', 'dvi.gz' ) {
my $fn = $tr->code.'.'.$suffix;
if (-r $fn && ((stat(_))[2] & Fcntl::S_IROTH)) {
push @files, $fn;
}
}
return @files;
}
# all available download formats (e.g. for DataCite Metadata Schema)
sub formats {
my ($tr) = @_;
my @formats;
for my $suffix ( 'pdf', 'ps.gz', 'dvi.gz' ) {
my $format = $suffix;
$format =~ s/\.gz$//;
my $fn = $tr->code.'.'.$suffix;
if (-r $fn && ((stat(_))[2] & Fcntl::S_IROTH)) {
push @formats, uc($format);
}
}
return @formats;
}
# URL of the full document file
sub url_file {
my ($tr) = @_;
return $tr->{url} if $tr->{url};
my @files = $tr->files;
$tr->{url} = $tr->{db}{reporturl} . $files[0] if @files;
return $tr->{url};
}
sub downloads {
my ($tr) = @_;
my @html;
for my $fn ($tr->files) {
$fn =~ /\.(\w+)(?:\.gz)?$/;
my $type = uc($1);
my $size = (stat($fn))[7];
push @html, '' . $type . '' .
sprintf(' (%.1f MB)', $size/1e6);
}
return join('
', @html);
}
# URL of the abstracts web page
sub url_abs {
my ($tr) = @_;
return "$tr->{db}{reporturl}$tr->{db}{code}-$tr->{nr}.html";
}
# TR title in a single line, i.e. with line separators replaced
sub titleline {
my ($tr) = @_;
my $s = $tr->{title};
return undef unless defined $s;
# Substitute all line separators in a string with " : " or
# -- if preceeded by punctuation or followed by parenthesis -- with " "
$s =~ s/([\p{Pd}\.:,;?!])\N{LINE SEPARATOR}/$1 /g;
$s =~ s/\N{LINE SEPARATOR}([\(])/ $1/g;
$s =~ s/\N{LINE SEPARATOR}/ : /g;
return $s;
}
sub html_title {
my ($tr) = @_;
return CLWeb::utf8_to_sgml($tr->titleline) || 'title missing';
}
sub html_doc {
my ($tr) = @_;
return ''.$tr->html_title.'';
}
sub html_abs {
my ($tr) = @_;
return ''.$tr->html_title.'';
}
sub authors {
my ($tr, $param) = @_;
my @authors = @{$tr->{authors}};
if ($param->{maxauthors} > 0 &&
@authors > $param->{maxauthors}) {
splice(@authors, $param->{maxauthors} - 1);
push @authors, $TRAuthor::et_al;
}
return @authors;
}
sub html_authors {
my ($tr, $param) = @_;
return join(', ', map {$_->html} $tr->authors($param));
}
# Does the authors field actually list the editors of the report?
# Assume yes if the title starts with "Proceedings ".
sub editors {
return $_[0]->{title} =~ /^Proceedings /;
}
# parse the contrib field and return a list
# of contributions and contributors the form
# ( contribution1 => [ contributor1, controbutor2 ],
# contributionw => [ contributor3 ], ... )
sub contributions {
my ($tr) = @_;
my @contributions;
for my $c (split(';;', $tr->{contrib})) {
if ($c =~ /^(.*)::(.*)$/) {
my $contribution = $1;
my $contributors = $2;
my @contributors;
eval {
@contributors =
map { TRAuthor::new($_) } split(/, /,$contributors,-1);
};
die($tr->code.":cannot parse contrib field:$@") if $@;
push @contributions, $contribution => \@contributors;
}
}
return @contributions;
}
# get just a list of the contributors, without identifying the contribution
sub contributors {
my ($tr) = @_;
my @contributions = $tr->contributions;
my @contributors;
while (@contributions) {
my $contribution = shift @contributions;
my $contributors = shift @contributions;
push @contributors, @{$contributors};
}
return @contributors;
}
# if it is a thesis, return degree, college and submission-date
sub thesis {
my ($tr) = @_;
if (my @thesis = grep(/^phd|ophd|mphil|cst/, @{$tr->{_notes}})) {
if ($tr->{$thesis[0]} =~ /^(?:([^,]*)(?:,([\d-]*))?)?$/) {
my $degree=$thesis[0];
my $college=$1;
my $submission=$2;
die("TR-$tr->{nr}:syntax error in submission date '$submission'")
unless ($submission =~ /^(\d{4}(-\d{2}(-\d{2})?)?)?$/);
return ($degree, $college, $submission);
} else {
die("TR-$tr->{nr}:",
"invalid thesis note '$thesis[0]=$tr->{$thesis[0]}'\n");
}
}
return;
}
# is this a CL PhD?
sub is_phd($) {
my ($self) = @_;
return exists $self->{phd};
}
sub submitted($) {
return ($_[0]->thesis)[2];
}
# estimate publication year (needed e.g. for DOI registration)
sub year {
my ($tr, $dir) = @_;
# if we have a publication date, simply take the year from that
if ($tr->{date} =~ /^(\d{4})/) { return $1 };
# otherwise, if the nearest preceeding and following report
# with a year show the same year, take that
if ($dir) {
# recurse one way only subsequently
return $tr->{db}{$tr->{nr}+$dir}->year($dir)
if $tr->{db}{$tr->{nr}+$dir};
} else {
# recurse both ways at first
my ($prev, $next);
$prev = $tr->{db}{$tr->{nr}-1}->year(-1)
if $tr->{db}{$tr->{nr}-1};
$next = $tr->{db}{$tr->{nr}+1}->year(+1)
if $tr->{db}{$tr->{nr}+1};
if ($prev eq $next) { return $prev; }
}
# otherwise, if the report is based on a thesis,
# use the submission date recorded for that
# (this can often be a year earlier than submission,
# due to the length of the examination process)
if ($tr->submitted =~ /^(\d{4})/) { return $1 };
return;
}
# returns the licence name, URL, and short-code applicable to the report,
# or an empty list in case the default licence applies
sub licence {
my ($tr) = @_;
if (my $cc = $tr->{'cc'}) {
if ($cc eq 'by') {
return
('Creative Commons Attribution 4.0 International (CC BY 4.0)',
'https://creativecommons.org/licenses/by/4.0/', $cc);
} elsif ($cc eq 'by-sa') {
return
('Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)',
'https://creativecommons.org/licenses/by-sa/4.0/', $cc);
} elsif ($cc eq 'by-nd') {
return
('Creative Commons Attribution-NoDerivatives 4.0 International (CC BY-ND 4.0)',
'https://creativecommons.org/licenses/by-nd/4.0/', $cc);
} else {
die("Unknown CC licence '$cc' in ", $tr->code, "\n");
}
}
return;
}
# a bibliographic reference for the report
sub html {
my ($tr) = @_;
my @html;
push @html, $tr->html_authors . ': ' if $tr->authors;
push @html, $tr->html_abs;
push @html, ', ' . expand_date($tr->{date}) if $tr->{date};
push @html, ".";
my @ss = ();
push @ss, "$tr->{pages} pages" if $tr->{pages};
push @ss, "PhD thesis"
if grep(/^o?phd/, @{$tr->{_notes}});
push @ss, "MPhil thesis"
if grep(/^mphil/, @{$tr->{_notes}});
push @html, ' (' . join(', ', @ss) . ')' if @ss;
return @html;
}
sub doi {
my ($tr) = @_;
return if $tr->{'to-appear'};
my $doiprefix = $tr->{db}{doiprefix};
return "$doiprefix/tr-$tr->{nr}" if $doiprefix;
return;
}
# https://blog.datacite.org/introducing-datacite-json/
# https://doi.test.datacite.org/dois/10.81026%2Ftr-577
sub datacite_json {
my ($tr) = @_;
my $doi = $tr->doi;
die("No DOI prefix defined\n") unless $doi;
my $j = {
id => "https://doi.org/" . $tr->doi,
doi => $doi,
url => $tr->url_abs,
types => {
bibtex => 'TechReport',
resourceType => 'technical report',
resourceTypeGeneral => 'Report'
},
};
for my $author ($tr->authors) {
my $c = {
name => $author->fullname,
};
if ($author->{forenames} && $author->{surname}) {
$c->{givenName} = $author->{forenames};
$c->{familyName} = $author->{surname};
$c->{nameType} = 'Personal';
}
push @{$j->{creators}}, $c;
};
push @{$j->{titles}}, {
lang => 'en',
title => $tr->{title},
} if $tr->{title};
$j->{publisher} = $tr->{db}->{department} .
", University of Cambridge" if $tr->{db}->{department};
my $year = $tr->year;
$j->{publicationYear} = $tr->year if $year;
push @{$j->{sizes}}, "$tr->{pages} pages" if $tr->{pages};
my @formats = $tr->formats;
push @{$j->{formats}}, @formats if @formats;
push @{$j->{descriptions}}, {
lang => 'en',
description => $tr->{abstract},
descriptionType => 'Abstract',
} if $tr->{abstract};
return $j;
}
# some auxiliary functions
use Exporter 'import';
our @EXPORT_OK = qw(expand_date expand_date_us);
our @month = ('???', 'January', 'February', 'March', 'April', 'May', 'June',
'July', 'August', 'September', 'October', 'November', 'December');
# turn '2002-01-14' into '14 January 2002', etc.
sub expand_date {
my ($d) = @_;
if ($d =~ /^(\d{4})-?(\d{2})-?(\d{2})$/) {
return sprintf("%d ", $3) . $month[$2] . " $1";
} elsif ($d =~ /^(\d{4})-?(\d{2})$/) {
return $month[$2] . " $1";
}
return $d;
}
# turn '2002-01-14' into 'January 14, 2002', etc.
sub expand_date_us {
my ($d) = @_;
if ($d =~ /^(\d{4})-?(\d{2})-?(\d{2})$/) {
return $month[$2] . sprintf(" %d", $3) . ", $1";
} elsif ($d =~ /^(\d{4})-?(\d{2})$/) {
return $month[$2] . " $1";
}
return $d;
}
package TRAuthor;
use strict;
use utf8;
our $et_al = bless { surname => 'et al.' } => 'TRAuthor';
sub new {
my ($s) = @_;
my $a = bless {} => 'TRAuthor';
if ($s =~ /^(?:(.+) )?([^ <>]+)(?: <([a-z]+[0-9]*)>)?$/) {
$a->{'forenames'} = $1 if $1;
$a->{'surname'} = $2;
$a->{'crsid'} = $3 if $3;
$a->{'surname'} =~ s/ / /g; # any NBSPs in surname are actually spaces
} else {
die("Unexpected format of author name: '$s'\n");
}
return $a;
}
sub fullname {
my ($self) = @_;
return join(' ', grep { defined } $self->{forenames}, $self->{surname});
}
sub surnamefirst {
my ($self) = @_;
return join(', ', grep { defined } $self->{surname}, $self->{forenames});
}
sub sortname {
my ($self) = @_;
return join(' ', $self->{surname}, $self->{forenames}, $self->{crsid});
}
sub url {
my ($self) = @_;
return $self->{url} if $self->{url};
my $url;
my $verbose = 0;
if (!$self->{url} && $self->{crsid}) {
print STDERR "homepage('$self->{crsid}') = " if $verbose;
$url = CLWeb::homepage_url($self->{crsid});
print STDERR "'$url'\n" if $verbose;
$self->{url} = $url if $url;
}
return $self->{url};
}
sub html {
my ($self, $param) = @_;
my $name = $param->{surnamefirst} ? $self->surnamefirst : $self->fullname;
my $html = CLWeb::utf8_to_sgml(CLWeb::nobreak $name);
my $url = $self->url;
$html = '' . $html . '' if $url;
return $html;
}
# return author data as a string as formatted in tr-database.txt
sub save($) {
my ($a) = @_;
my $s = $a->{surname};
$s =~ s/ / /g; # encode any space in surname as NBSP
$s = $a->{forenames}.' '.$s if $a->{forenames};
$s .= " <$a->{crsid}>" if $a->{crsid};
return $s;
}
1;