# API for reading and writing the Technical Report database files # # Markus Kuhn package TechReports; use 5.016; use utf8; use lib '../../scripts', '/anfs/www/VH-cl/scripts'; use CLWeb (); use Digest::MD5 (); use POSIX (); use Time::Local (); # Load the named tr-database.txt database file and return a hash table # that contains both parameters and TechReport objects indexed by # report number. # # That hash table also contains a few special entries aimed at the # save routine, to help it preserve the order of included # parameters as well as any comments or empty lines that precede # a parameter or tech-report line. sub load { my ($class, $dbfile, $absfile, $timestampfile) = @_; my $db = bless { } => $class; my @missing; my $nr = 0; my $dbf; open($dbf, "<:encoding(UTF-8)", $dbfile) || die("Can't open technical-report database '$dbfile':\n$!"); my @comments; my @params; while (<$dbf>) { if (/^\s*\#/ || /^\s*$/) { # record comments and empty lines push @comments, $_; } elsif (/^(\d+)\|([^|]*)\|([^|]*)\|([^|]*)\|(\d*)\|(.*)$/) { # read and create report entry if (exists $db->{$1}) { die("$dbfile:$.:repeat occurrence of report number $1\n$_"); } if ($1 < $nr) { warn("$dbfile:$.:non-monotonic numbering from $nr to $1\n"); } elsif ($1 != $nr + 1) { push @missing, ($nr+1)..($1-1); } $nr = $1; my $tr = bless({ 'nr' => $nr, 'date' => $2, 'title' => $3, 'pages' => $5, 'db' => $db } => 'TechReport'); my @notes = split(/\|/, $6, -1); if (@comments) { $tr->{'_comment'} = [ @comments ]; @comments = (); } eval { $tr->{'authors'} = [ map { TRAuthor::new($_) } split(/, /,$4,-1) ]; }; # should contributors go into an extension of the author field # rather than into a contrib= note? die("$dbfile:$.:$@") if $@; $tr->{'title'} =~ s/\\\\/\N{LINE SEPARATOR}/g; die("$dbfile:$.:date field syntax error\n$_") unless $tr->{date} =~ /^|\d{4}|\d{4}-\d{2}|\d{4}-\d{2}-\d{2}$/; warn("$dbfile:$.:no author\n$_") unless @{$tr->{'authors'}}; $db->{$nr} = $tr; for my $note (@notes) { if ($note =~ /^([a-z0-9-]+)=(.*)$/) { my $key = $1; my $value = $2; die("$dbfile:$.:duplicate note '$key'\n") if exists $tr->{$key}; $value =~ s/\\\\/\N{LINE SEPARATOR}/g; $value =~ s/\\_/\N{PARAGRAPH SEPARATOR}/g; $tr->{$key} = $value; push @{$tr->{'_notes'}}, $key; } elsif ($note =~ /^([a-z0-9-]+)$/) { die("$dbfile:$.:duplicate note '$1'\n") if exists $tr->{$1}; $tr->{$1} = undef; push @{$tr->{'_notes'}}, $1; } else { die("$dbfile:$.:syntax error in notes field '$note'\n$_"); } } } elsif (/^([a-z]+)=(.*)$/) { # series parameters my $key = $1; my $value = $2; push @params, $key; $value =~ s/\\\\/\N{LINE SEPARATOR}/g; $value =~ s/\\_/\N{PARAGRAPH SEPARATOR}/g; $db->{$key} = $value; if (@comments) { $db->{"_$key comment"} = [ @comments ]; @comments = (); } } else { die("$dbfile:$.:syntax error\n$_"); } } close($dbf); $db->{'_final comment'} = [ @comments ] if @comments; $db->{'_params'} = [ @params ] if @params; warn("$dbfile:missing report numbers " . join(', ', @missing) . "\n") if @missing && 0; $db->load_abstracts($absfile) if defined $absfile; $db->load_timestamps($timestampfile) if defined $timestampfile; return $db; } # Read the abstracts into an already existing TechReports table sub load_abstracts { my ($db, $absfile) = @_; # read in abstracts my $abs; open($abs, "<:encoding(UTF-8)", $absfile) || die("$absfile: cannot open abstracts database: $!\n"); # record comments and skip empty lines my @comments; while (<$abs>) { last unless /^\s*\#/ || /^\s*$/; push @comments, $_ unless /^\s*$/; } $db->{'_abstracts comment'} = [ @comments ] if @comments; my $nr; /^==([0-9]+)==+$/ or die("$absfile:$.:unexpected text before first separator line\n$_"); $nr = $1; while (!eof($abs)) { my $tr = $db->{int($nr)}; warn("$absfile:$.:found abstract for ". "non-existing report number $nr\n") unless $tr; die("$absfile:$.:found second abstract for number $nr\n") if $tr && exists $tr->{'abstract'}; my @abstract; while (<$abs>) { if (/^==([0-9]+)==+$/) { $nr = $1; last; }; push @abstract, $_; } my $abstract = join('', @abstract); $abstract =~ s/\n+$/\n/s; $abstract =~ s/(\S)\n(\S)/$1 $2/sg; $tr->{'abstract'} = $abstract if $tr; } close($abs); } # list of technical reports sub trs { my ($db, $all) = @_; my @trs = map { $db->{$_} } sort {$a <=> $b} grep { /^[0-9]+$/ } keys %$db; return @trs if $all; return grep {!exists $_->{deleted}} @trs; } # Convert a string indicating a range of report numbers (such as # "100", "100-105", "-100", "100-") into a list of TechReport references. # Several range-specifiers can be concatenated by comma. sub range($) { my ($db, $rlist) = @_; my @list = (); return $db->trs unless $rlist; for $_ (split /,/, $rlist) { my $first; my $last; if (/^(\d+)$/) { $first = $last = $1; } elsif (/^-(\d+)$/) { $last = $1; } elsif (/^(\d+)-$/) { $first = $1; } elsif (/^(\d+)-(\d+)$/) { $first =$1; $last = $2; } else { die("Unrecognized range specification '$rlist'!\n"); } push @list, grep(!(defined $first && $_->{nr} < $first or defined $last && $_->{nr} > $last), $db->trs); } die("'$rlist' did not match any existing report number!\n") unless @list; return @list; } # Save a database of TR objects into a tr-database.txt file sub save { my ($db, $dbfile) = @_; my @db; my %params; foreach my $p (grep { /^[a-z]+$/ } keys %{$db}) { $params{$p} = 1 }; # output loaded parameters in order in which there were loaded # including any comments that preceded each parameter foreach my $p (@{$db->{'_params'}}) { push @db, @{$db->{"_$p comment"}}; next unless exists $params{$p}; my $value=$db->{$p}; $value =~ s/\N{LINE SEPARATOR}/\\\\/g; $value =~ s/\N{PARAGRAPH SEPARATOR}/\\_/g; push @db, "$p=$value\n"; delete $params{$p}; } # then also output any newly added parameters foreach my $p (sort keys %params) { push @db, "$p=$db->{$p}\n"; } foreach my $tr ($db->trs('all')) { push @db, @{$tr->{'_comment'}} if $tr->{'_comment'}; push @db, $tr->db_line; } push @db, @{$db->{'_final comment'}} if exists $db->{'_final comment'}; CLWeb::write_file($dbfile, @db); } sub save_abstracts { my ($db, $absfile) = @_; my @abs; push @abs, @{$db->{'_abstracts comment'}} if exists $db->{'_abstracts comment'}; foreach my $tr ($db->trs('all')) { next unless exists $tr->{'abstract'}; push @abs, "\n==$tr->{nr}================================================================\n"; push @abs, $tr->{'abstract'}; } CLWeb::write_file($absfile, @abs); } sub load_timestamps { my ($db, $timestampfile) = @_; my $tf; open($tf, "<", $timestampfile) || die("$timestampfile: cannot open timestamp database: $!\n"); my @modified; my @deleted; while (<$tf>) { next if /^\s*\#/ || /^\s*$/; # skip comments and empty lines; if (/^([0-9]+)\|(\d{4})(\d{2})(\d{2})T(\d{2})(\d{2})(\d{2})Z\|([0-9a-f]{32})$/) { my $nr = $1; my $lastmod = Time::Local::timegm($7, $6, $5, $4, $3-1, $2-1900); my $lastdigest = $8; die("$timestampfile:$.:odd timestamp\n$_") unless $lastmod; if (exists $db->{$nr}) { my $tr = $db->{$nr}; if ($lastdigest ne $tr->digest) { push @modified, $nr; } else { $tr->{'_lastmod'} = $lastmod; } } else { push @{$db->{'_deleted'}}, $nr; } } else { die("Syntax error in timestamp database line $.:\n$_"); } } close($tf); # also find any newly added TechReports my @added = map { $_->{nr} } grep { !exists $_->{'_lastmod'} } $db->trs; push @{$db->{'_updated'}}, @modified, @added; push @{$db->{'_deleted'}}, @deleted; my $timestamp = time; for my $nr (@modified, @added) { $db->{$nr}{'_lastmod'} = $timestamp; } } sub save_timestamps { my ($db, $timestampfile) = @_; my @ts = ("# TECHNICAL REPORT DATABASE MODIFICATION TIMESTAMPS\n". "# AUTOMATICALLY GENERATED FILE -- DO NOT EDIT\n". "# Format: nr|timestamp|md5-record-digest\n"); my $timestamp = time; foreach my $tr ($db->trs('all')) { $tr->{'_lastmod'} = $timestamp unless $tr->{'_lastmod'}; push @ts, ($tr->{nr}, '|', POSIX::strftime('%Y%m%dT%H%M%SZ', gmtime($tr->{'_lastmod'})), '|',$tr->digest,"\n"); } CLWeb::write_file($timestampfile, @ts); } package TechReport; # generate a single-line record of a TR in tr-database.txt format sub db_line { my ($tr) = @_; my $title = $tr->{title}; $title =~ s/\N{LINE SEPARATOR}/\\\\/g; my $line = "$tr->{nr}|$tr->{date}|$title|". join(', ', map { $_->save } $tr->authors)."|$tr->{pages}|"; my @notes; # append notes foreach my $note (@{$tr->{'_notes'}}) { my $value = $tr->{$note}; if (defined $value) { $value =~ s/\N{LINE SEPARATOR}/\\\\/g; $value =~ s/\N{PARAGRAPH SEPARATOR}/\\_/g; push @notes, "$note=$value"; } else { push @notes, $note; } } return $line . join('|', @notes) . "\n"; } sub digest { my ($tr) = @_; my $line = "$tr->{nr}.1|$tr->{date}|$tr->{title}|". # the '.1' was part no join(', ', map { $_->save } $tr->authors)."|$tr->{pages}|"; my @notes; # append notes foreach my $note (@{$tr->{'_notes'}}) { push @notes, defined $tr->{$note} ? "$note=$tr->{$note}" : $note; } $line .= join('|', @notes) . "\n"; my $preimage = Encode::encode_utf8($line . $tr->{'abstract'}); return Digest::MD5::md5_hex($preimage); } # full TR number sub code { my ($tr) = @_; return "$tr->{db}{code}-$tr->{nr}"; } # all available download files sub files { my ($tr) = @_; my @files; for my $suffix ( 'pdf', 'ps.gz', 'dvi.gz' ) { my $fn = $tr->code.'.'.$suffix; if (-r $fn && ((stat(_))[2] & Fcntl::S_IROTH)) { push @files, $fn; } } return @files; } # all available download formats (e.g. for DataCite Metadata Schema) sub formats { my ($tr) = @_; my @formats; for my $suffix ( 'pdf', 'ps.gz', 'dvi.gz' ) { my $format = $suffix; $format =~ s/\.gz$//; my $fn = $tr->code.'.'.$suffix; if (-r $fn && ((stat(_))[2] & Fcntl::S_IROTH)) { push @formats, uc($format); } } return @formats; } # URL of the full document file sub url_file { my ($tr) = @_; return $tr->{url} if $tr->{url}; my @files = $tr->files; $tr->{url} = $tr->{db}{reporturl} . $files[0] if @files; return $tr->{url}; } sub downloads { my ($tr) = @_; my @html; for my $fn ($tr->files) { $fn =~ /\.(\w+)(?:\.gz)?$/; my $type = uc($1); my $size = (stat($fn))[7]; push @html, '' . $type . '' . sprintf(' (%.1f MB)', $size/1e6); } return join('
', @html); } # URL of the abstracts web page sub url_abs { my ($tr) = @_; return "$tr->{db}{reporturl}$tr->{db}{code}-$tr->{nr}.html"; } # TR title in a single line, i.e. with line separators replaced sub titleline { my ($tr) = @_; my $s = $tr->{title}; return undef unless defined $s; # Substitute all line separators in a string with " : " or # -- if preceeded by punctuation or followed by parenthesis -- with " " $s =~ s/([\p{Pd}\.:,;?!])\N{LINE SEPARATOR}/$1 /g; $s =~ s/\N{LINE SEPARATOR}([\(])/ $1/g; $s =~ s/\N{LINE SEPARATOR}/ : /g; return $s; } sub html_title { my ($tr) = @_; return CLWeb::utf8_to_sgml($tr->titleline) || 'title missing'; } sub html_doc { my ($tr) = @_; return ''.$tr->html_title.''; } sub html_abs { my ($tr) = @_; return ''.$tr->html_title.''; } sub authors { my ($tr, $param) = @_; my @authors = @{$tr->{authors}}; if ($param->{maxauthors} > 0 && @authors > $param->{maxauthors}) { splice(@authors, $param->{maxauthors} - 1); push @authors, $TRAuthor::et_al; } return @authors; } sub html_authors { my ($tr, $param) = @_; return join(', ', map {$_->html} $tr->authors($param)); } # Does the authors field actually list the editors of the report? # Assume yes if the title starts with "Proceedings ". sub editors { return $_[0]->{title} =~ /^Proceedings /; } # parse the contrib field and return a list # of contributions and contributors the form # ( contribution1 => [ contributor1, controbutor2 ], # contributionw => [ contributor3 ], ... ) sub contributions { my ($tr) = @_; my @contributions; for my $c (split(';;', $tr->{contrib})) { if ($c =~ /^(.*)::(.*)$/) { my $contribution = $1; my $contributors = $2; my @contributors; eval { @contributors = map { TRAuthor::new($_) } split(/, /,$contributors,-1); }; die($tr->code.":cannot parse contrib field:$@") if $@; push @contributions, $contribution => \@contributors; } } return @contributions; } # get just a list of the contributors, without identifying the contribution sub contributors { my ($tr) = @_; my @contributions = $tr->contributions; my @contributors; while (@contributions) { my $contribution = shift @contributions; my $contributors = shift @contributions; push @contributors, @{$contributors}; } return @contributors; } # if it is a thesis, return degree, college and submission-date sub thesis { my ($tr) = @_; if (my @thesis = grep(/^phd|ophd|mphil|cst/, @{$tr->{_notes}})) { if ($tr->{$thesis[0]} =~ /^(?:([^,]*)(?:,([\d-]*))?)?$/) { my $degree=$thesis[0]; my $college=$1; my $submission=$2; die("TR-$tr->{nr}:syntax error in submission date '$submission'") unless ($submission =~ /^(\d{4}(-\d{2}(-\d{2})?)?)?$/); return ($degree, $college, $submission); } else { die("TR-$tr->{nr}:", "invalid thesis note '$thesis[0]=$tr->{$thesis[0]}'\n"); } } return; } # is this a CL PhD? sub is_phd($) { my ($self) = @_; return exists $self->{phd}; } sub submitted($) { return ($_[0]->thesis)[2]; } # estimate publication year (needed e.g. for DOI registration) sub year { my ($tr, $dir) = @_; # if we have a publication date, simply take the year from that if ($tr->{date} =~ /^(\d{4})/) { return $1 }; # otherwise, if the nearest preceeding and following report # with a year show the same year, take that if ($dir) { # recurse one way only subsequently return $tr->{db}{$tr->{nr}+$dir}->year($dir) if $tr->{db}{$tr->{nr}+$dir}; } else { # recurse both ways at first my ($prev, $next); $prev = $tr->{db}{$tr->{nr}-1}->year(-1) if $tr->{db}{$tr->{nr}-1}; $next = $tr->{db}{$tr->{nr}+1}->year(+1) if $tr->{db}{$tr->{nr}+1}; if ($prev eq $next) { return $prev; } } # otherwise, if the report is based on a thesis, # use the submission date recorded for that # (this can often be a year earlier than submission, # due to the length of the examination process) if ($tr->submitted =~ /^(\d{4})/) { return $1 }; return; } # returns the licence name, URL, and short-code applicable to the report, # or an empty list in case the default licence applies sub licence { my ($tr) = @_; if (my $cc = $tr->{'cc'}) { if ($cc eq 'by') { return ('Creative Commons Attribution 4.0 International (CC BY 4.0)', 'https://creativecommons.org/licenses/by/4.0/', $cc); } elsif ($cc eq 'by-sa') { return ('Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)', 'https://creativecommons.org/licenses/by-sa/4.0/', $cc); } elsif ($cc eq 'by-nd') { return ('Creative Commons Attribution-NoDerivatives 4.0 International (CC BY-ND 4.0)', 'https://creativecommons.org/licenses/by-nd/4.0/', $cc); } else { die("Unknown CC licence '$cc' in ", $tr->code, "\n"); } } return; } # a bibliographic reference for the report sub html { my ($tr) = @_; my @html; push @html, $tr->html_authors . ': ' if $tr->authors; push @html, $tr->html_abs; push @html, ', ' . expand_date($tr->{date}) if $tr->{date}; push @html, "."; my @ss = (); push @ss, "$tr->{pages} pages" if $tr->{pages}; push @ss, "PhD thesis" if grep(/^o?phd/, @{$tr->{_notes}}); push @ss, "MPhil thesis" if grep(/^mphil/, @{$tr->{_notes}}); push @html, ' (' . join(', ', @ss) . ')' if @ss; return @html; } sub doi { my ($tr) = @_; return if $tr->{'to-appear'}; my $doiprefix = $tr->{db}{doiprefix}; return "$doiprefix/tr-$tr->{nr}" if $doiprefix; return; } # https://blog.datacite.org/introducing-datacite-json/ # https://doi.test.datacite.org/dois/10.81026%2Ftr-577 sub datacite_json { my ($tr) = @_; my $doi = $tr->doi; die("No DOI prefix defined\n") unless $doi; my $j = { id => "https://doi.org/" . $tr->doi, doi => $doi, url => $tr->url_abs, types => { bibtex => 'TechReport', resourceType => 'technical report', resourceTypeGeneral => 'Report' }, }; for my $author ($tr->authors) { my $c = { name => $author->fullname, }; if ($author->{forenames} && $author->{surname}) { $c->{givenName} = $author->{forenames}; $c->{familyName} = $author->{surname}; $c->{nameType} = 'Personal'; } push @{$j->{creators}}, $c; }; push @{$j->{titles}}, { lang => 'en', title => $tr->{title}, } if $tr->{title}; $j->{publisher} = $tr->{db}->{department} . ", University of Cambridge" if $tr->{db}->{department}; my $year = $tr->year; $j->{publicationYear} = $tr->year if $year; push @{$j->{sizes}}, "$tr->{pages} pages" if $tr->{pages}; my @formats = $tr->formats; push @{$j->{formats}}, @formats if @formats; push @{$j->{descriptions}}, { lang => 'en', description => $tr->{abstract}, descriptionType => 'Abstract', } if $tr->{abstract}; return $j; } # some auxiliary functions use Exporter 'import'; our @EXPORT_OK = qw(expand_date expand_date_us); our @month = ('???', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December'); # turn '2002-01-14' into '14 January 2002', etc. sub expand_date { my ($d) = @_; if ($d =~ /^(\d{4})-?(\d{2})-?(\d{2})$/) { return sprintf("%d ", $3) . $month[$2] . " $1"; } elsif ($d =~ /^(\d{4})-?(\d{2})$/) { return $month[$2] . " $1"; } return $d; } # turn '2002-01-14' into 'January 14, 2002', etc. sub expand_date_us { my ($d) = @_; if ($d =~ /^(\d{4})-?(\d{2})-?(\d{2})$/) { return $month[$2] . sprintf(" %d", $3) . ", $1"; } elsif ($d =~ /^(\d{4})-?(\d{2})$/) { return $month[$2] . " $1"; } return $d; } package TRAuthor; use strict; use utf8; our $et_al = bless { surname => 'et al.' } => 'TRAuthor'; sub new { my ($s) = @_; my $a = bless {} => 'TRAuthor'; if ($s =~ /^(?:(.+) )?([^ <>]+)(?: <([a-z]+[0-9]*)>)?$/) { $a->{'forenames'} = $1 if $1; $a->{'surname'} = $2; $a->{'crsid'} = $3 if $3; $a->{'surname'} =~ s/ / /g; # any NBSPs in surname are actually spaces } else { die("Unexpected format of author name: '$s'\n"); } return $a; } sub fullname { my ($self) = @_; return join(' ', grep { defined } $self->{forenames}, $self->{surname}); } sub surnamefirst { my ($self) = @_; return join(', ', grep { defined } $self->{surname}, $self->{forenames}); } sub sortname { my ($self) = @_; return join(' ', $self->{surname}, $self->{forenames}, $self->{crsid}); } sub url { my ($self) = @_; return $self->{url} if $self->{url}; my $url; my $verbose = 0; if (!$self->{url} && $self->{crsid}) { print STDERR "homepage('$self->{crsid}') = " if $verbose; $url = CLWeb::homepage_url($self->{crsid}); print STDERR "'$url'\n" if $verbose; $self->{url} = $url if $url; } return $self->{url}; } sub html { my ($self, $param) = @_; my $name = $param->{surnamefirst} ? $self->surnamefirst : $self->fullname; my $html = CLWeb::utf8_to_sgml(CLWeb::nobreak $name); my $url = $self->url; $html = '' . $html . '' if $url; return $html; } # return author data as a string as formatted in tr-database.txt sub save($) { my ($a) = @_; my $s = $a->{surname}; $s =~ s/ / /g; # encode any space in surname as NBSP $s = $a->{forenames}.' '.$s if $a->{forenames}; $s .= " <$a->{crsid}>" if $a->{crsid}; return $s; } 1;