#!/usr/bin/perl
# Generate the Computer Laboratory's PhD thesis list at
#
#   http://www.cl.cam.ac.uk/local/phd/list/
#
# using data from the Research_Students table in the lab's SQLServer
# admin database
#
# Usage:
#   sudo -u wwwupdate \
#     /anfs/www/VH-cl/scripts/phd.pl /anfs/www/VH-cl/html/local/phd/list/index-b.html
#
# Markus Kuhn -- 2008-01-07

package TechReport;

use strict;
use utf8;

# Read the technical-report database file $dbfile and return it as a hash
# (flattened key/value list).
#
# Each report line has the form
#
#   nr[.volume]|date|title|authors|pages|note|note|...
#
# and becomes a TechReport object stored under its report number.  The
# authors field is split on ", " into Author objects.  Every note is
# either "key=value" or a bare "key" (stored with an undefined value).
# Lines of the form "key=value" outside a report entry are series
# parameters and are stored as plain strings under their key.  Comment
# lines (starting with '#') and blank lines are skipped.
#
# Dies if the file cannot be opened or if a line cannot be parsed.
sub load($) {
    my ($dbfile) = @_;
    my %tr;

    open(my $db, "<:utf8", $dbfile)
        or die("Can't open technical-report database '$dbfile':\n$!\n");
    while (my $line = <$db>) {
        # skip comments and empty lines
        next if $line =~ /^\s*\#/ || $line =~ /^\s*$/;

        if (my ($nr, $volume, $date, $title, $authors, $pages, $notes) =
                $line =~ /^(\d+)(?:\.([1-9]\d*))?\|([^|]*)\|([^|]*)\|([^|]*)\|(\d*)\|(.*)$/) {
            # read and create report entry
            my $report = bless({
                'nr'    => $nr,
                'date'  => $date,
                'title' => $title,
                'pages' => $pages,
            }, "TechReport");
            $report->{'volume'} = $volume if $volume;
            # each author is a blessed reference to its own copy of the name
            $report->{'authors'} = [
                map { my $name = $_; bless(\$name, 'Author') }
                    split(/, /, $authors)
            ];
            $tr{$nr} = $report;

            for my $note (split(/\|/, $notes)) {
                if ($note =~ /^([a-z-]+)=(.*)$/) {
                    $report->{$1} = $2;
                }
                elsif ($note =~ /^([a-z-]+)$/) {
                    $report->{$1} = undef;
                }
                else {
                    die("Syntax error in notes field of " .
                        "report database line $.:\n$line");
                }
            }
        }
        elsif ($line =~ /^([a-z]+)=(.*)$/) {
            # series parameters
            $tr{$1} = $2;
        }
        else {
            die("Syntax error in report database line $.:\n$line");
        }
    }
    close($db);

    return %tr;
}

# True if this report carries a "phd" note (with or without a value).
sub is_phd($) {
    my ($self) = @_;
    return exists $self->{phd};
}

# Return the submission date recorded after the comma in the "phd" note
# (YYYY, YYYY-MM or YYYY-MM-DD; undefined if the note ends in a bare
# comma).  Returns nothing if the report has no "phd" note, and "" if the
# note has no value or cannot be parsed (a warning is issued in the
# latter case).
sub submitted($) {
    my ($self) = @_;
    return unless exists $self->{phd};
    return "" unless defined $self->{phd};    # for TR-527
    if ($self->{phd} =~ /^(.*),(\d{4}(-\d{2}(-\d{2})?)?)?$/) {
        return $2;
    }
    warn("TR-" . $self->{nr} . ": phd note '" . $self->{phd} . "'!\n");
    # Return explicitly: otherwise the sub would yield warn()'s own
    # (true) return value, which callers would print as a date.
    return "";
}

# True if the submission date in the "phd" note sorts (as a string)
# before $before.  Returns 0 whenever no date is available: no "phd"
# note, a note without value, a note ending in a bare comma, or a note
# that cannot be parsed (a warning is issued in the latter case).
sub submitted_before($$) {
    my ($self, $before) = @_;
    return 0 unless exists $self->{phd};
    return 0 unless defined $self->{phd};     # for TR-527
    if ($self->{phd} =~ /^(.*),(\d{4}(-\d{2}(-\d{2})?)?)$/) {
        return $2 lt $before;
    }
    return 0 if $self->{phd} =~ /,$/;
    warn("TR-" . $self->{nr} . ": phd note '" . $self->{phd} . "'!\n");
    # Return explicitly: falling off the end would return warn()'s true
    # value and make an unparsable note count as "submitted before".
    return 0;
}

package Author;

use strict;
use utf8;

# An Author is a blessed reference to a scalar holding the full name.
#
# NOTE(review): in this copy of the script the character replaced by the
# substitutions below renders as an ordinary space, which would make them
# no-ops.  It is presumed to be a no-break space (U+00A0) used to keep
# multi-word name parts together -- confirm against the repository
# version.

# Full name, with no-break spaces turned into ordinary spaces.
sub fullname($) {
    my ($self) = @_;
    my $name = ${$self};
    $name =~ s/\x{00A0}/ /g;
    return $name;
}

# Last space-separated word of the name, with no-break spaces turned
# into ordinary spaces.
sub surname($) {
    my ($self) = @_;
    my $name = ${$self};
    $name =~ s/^.+ ([^ ]+)$/$1/;
    $name =~ s/\x{00A0}/ /g;
    return $name;
}

# Everything before the last space-separated word of the name.
sub forenames($) {
    my ($self) = @_;
    my $name = ${$self};
    $name =~ s/^(.+) [^ ]+$/$1/;
    return $name;
}

# First word of a two-word name; names with any other number of words
# are returned whole (apart from the no-break-space replacement).
sub firstname($) {
    my ($self) = @_;
    my $name = ${$self};
    $name =~ s/^([^ ]+) [^ ]+$/$1/;
    $name =~ s/\x{00A0}/ /g;
    return $name;
}

package main;

use strict;
require DBI;
require Time::Local;
require '/anfs/www/tools/share/PasswordVault.pm';
use POSIX qw(strftime);
use Encode;
use utf8;
use Unicode::Normalize;
use charnames ':full';

binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# PhD TRs that went through the Engineering Degree Committee (DTG)
my %dtg = (
    'Kasim Rehman'          => 1,
    'Alastair R. Beresford' => 1,
    'Ford Long Wong'        => 1,
);

# Some people changed their name between registration of their PhD
# and submitting their thesis in a way that is difficult to match
# automatically. We list their aliases here temporarily, but such
# information should really come from a proper database table.
my %rename = (
    'John Rooney'             => 'Sean Rooney',
    'Quentin Stafford-Fraser' => 'James Quentin Stafford-Fraser',
    'Monica Nesi Thery'       => 'Monica Nesi',
    'Olexiy Gotsman'          => 'Alexey Gotsman',
    'Oeistein Andersen'       => 'Øistein E. Andersen',
    'Ola Elsayed'             => 'Ola Mahmoud',
);

# directory where student-admin keeps local PHD copies
my $local_pdfs = '/usr/groups/studentadmin/PhD_theses';

# who maintains table
my $studentadmin = 'Lise Gough';

my %monthno = (
    'Jan' => 1, 'Feb' => 2,  'Mar' => 3,  'Apr' => 4,
    'May' => 5, 'Jun' => 6,  'Jul' => 7,  'Aug' => 8,
    'Sep' => 9, 'Oct' => 10, 'Nov' => 11, 'Dec' => 12,
);

# Convert a date string (e.g., whatever the database returns) into the
# form YYYY-MM-DD, going through the corresponding time(2) value in
# seconds since the system epoch (1970-01-01 00:00 UTC on Unix).
#
# Accepted formats:
#   YYYY-MM-DD hh:mm:ss[.fraction]
#   Mon D YYYY hh:mm(AM|PM)
#
# Returns nothing for an undefined or empty string; dies on any other
# string that matches neither format.
sub normalize_date {
    my ($s) = @_;
    # Both conditions must hold; with "||" an empty string slipped
    # through and hit the die() below.
    return unless defined $s && length($s);

    my $t;
    if ($s =~ /^(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})(?:\.\d*)?$/) {
        # The full four-digit year is passed on purpose: small year values
        # (year - 1900 < 100) are subject to Time::Local's century guessing.
        $t = Time::Local::timelocal($6, $5, $4, $3, $2 - 1, $1);
    }
    elsif ($s =~ /^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) +(\d{1,2}) (\d{4}) (\d{2}):(\d{2})([AP])M$/) {
        my ($month, $day, $year, $hour, $minute, $ampm) = ($1, $2, $3, $4, $5, $6);
        # 12-hour to 24-hour clock: 12AM is hour 0, 12PM stays 12
        if ($ampm eq 'A') {
            $hour -= 12 if $hour == 12;
        }
        else {
            $hour += 12 if $hour < 12;
        }
        $t = Time::Local::timelocal(0, $minute, $hour, $day,
                                    $monthno{$month} - 1, $year);
    }
    else {
        die("Unknown date format: '$s'\n");
    }
    return strftime('%Y-%m-%d', localtime $t);
}

# Reduce a string to an upper-case form without hyphens, diacritics or
# special Latin letters, for sloppy name comparison.
sub normalize_string($) {
    my ($s) = @_;
    $s =~ s/-/ /g;                 # replace hyphen
    $s =~ s/ü/ue/g;                # German umlaut
    $s = NFD($s);                  # Normalization Form D
    $s =~ s/[\p{Mn}\p{Me}]//g;     # remove combining characters

    # letters that do not decompose, applied in this order
    my @replacements = (
        [ "\N{LATIN SMALL LIGATURE OE}"               => 'oe' ],
        [ "\N{LATIN CAPITAL LIGATURE OE}"             => 'OE' ],
        [ "\N{LATIN SMALL LETTER AE}"                 => 'ae' ],
        [ "\N{LATIN CAPITAL LETTER AE}"               => 'AE' ],
        [ "\N{LATIN SMALL LETTER A WITH RING ABOVE}"  => 'aa' ],
        [ "\N{LATIN CAPITAL LETTER A WITH RING ABOVE}" => 'AA' ],
        [ "\N{LATIN SMALL LETTER O WITH STROKE}"      => 'o'  ],
        [ "\N{LATIN CAPITAL LETTER O WITH STROKE}"    => 'O'  ],
        [ "\N{LATIN SMALL LETTER L WITH STROKE}"      => 'l'  ],
        [ "\N{LATIN CAPITAL LETTER L WITH STROKE}"    => 'L'  ],
        [ "\N{LATIN SMALL LETTER SHARP S}"            => 'ss' ],
    );
    for my $pair (@replacements) {
        my ($from, $to) = @{$pair};
        $s =~ s/\Q$from\E/$to/g;
    }
    return uc($s);
}

# Build the name string used for matching, at increasing sloppiness:
#   0, 1  "forenames surname" unchanged
#   2     full stops removed
#   3     full stops removed, first word only, followed by the surname
#   4     first initial and normalized surname (last resort)
sub normalize_name {
    my ($forenames, $surname, $sloppiness) = @_;

    if ($sloppiness == 4) {
        return substr($forenames, 0, 1) . ' ' . normalize_string($surname);
    }

    my $name = "$forenames $surname";
    if ($sloppiness > 1) {
        # try without punctuation
        $name =~ s/\.//g;
    }
    if ($sloppiness == 3) {
        # try single firstname only
        $name =~ s/^([^ ]+).*$/$1/;
        $name = "$name $surname";
    }
    return $name;
}

# read Research_Students table from SQLServer database
my $dbh = DBI->connect('dbi:Sybase:server=www.ad.cl.cam.ac.uk',
                       'linuxscript',
                       PasswordVault::fetch_password('linuxscript',
                                                     'www.ad.cl.cam.ac.uk'),
                       {RaiseError => 1});
$dbh->do("use Administration");
my $sql = 'SELECT CRSID, Status, Surname, [First Name], Title, Submitted, Approved, Coll, Sup, Funds FROM Research_Students';
my $sth = $dbh->prepare($sql);
$sth->execute();

# per-student columns, all keyed by lower-case CRSID
my %status;
my %surname;
my %forenames;
my %title;
my %submissiondate;
my %approvaldate;
my %college;
my %supervisor;
my %funds;
my %comparename;

while (my @row = $sth->fetchrow_array()) {
    # decode each column from CP1252 and strip trailing whitespace;
    # undefined columns are left undefined
    for my $field (@row) {
        next unless defined $field;
        $field = decode('CP1252', $field);
        $field =~ s/\s+$//;
    }
    my $crsid = lc($row[0]);
    #print join('|', @row)."\n";
    #warn("$0: '$forenames{$crsid} $surname{$crsid}' appears in Research_Students without CRSID!\n") if !$crsid;
    #warn("$0: CRSID $crsid ($forenames{$crsid} $surname{$crsid}) reappears in Research_Students for $row[3] $row[2], contact $studentadmin!\n") if $crsid && defined($surname{$crsid});
    $status{$crsid}         = $row[1];
    $surname{$crsid}        = $row[2];
    $forenames{$crsid}      = $row[3];
    $title{$crsid}          = $row[4];
    $submissiondate{$crsid} = normalize_date($row[5]);
    $approvaldate{$crsid}   = normalize_date($row[6]);
    $college{$crsid}        = $row[7];
    $supervisor{$crsid}     = $row[8];
    $funds{$crsid}          = $row[9];

    my $surname = $surname{$crsid};
    $surname =~ s/-/ /g;
    $comparename{$crsid} = substr($forenames{$crsid}, 0, 1) . ' ' . uc($surname);
}

# read technical-report database
my $dbfile = "/anfs/www/html/techreports/tr-database.txt";
my %tr = TechReport::load($dbfile);
delete $tr{209};  # HACK: supress the earlier "Andrew William Moore"

# sort research students by submission date (descending string order)
my @phds = sort {
    -("$submissiondate{$a},$approvaldate{$a},$surname{$a},$forenames{$a}" cmp
      "$submissiondate{$b},$approvaldate{$b},$surname{$b},$forenames{$b}")
} keys %surname;

# do not consider non-PhD TRs and those that predate the CL degree committee
# (%tr also holds plain-string series parameters, hence the blessed() test)
require Scalar::Util;
my @trs = sort { $a->{nr} <=> $b->{nr} }
          grep {
              Scalar::Util::blessed($_)
                  and $_->is_phd
                  and !$_->submitted_before('1987')
          } values %tr;

for my $report (@trs) {
    warn("Multiple authors in TR-" . $report->{nr} . "!\n")
        if @{$report->{authors}} > 1;
}

# match PhDs with technical reports by comparing author names; level 0
# additionally requires identical titles, later levels compare
# progressively sloppier forms of the name (see normalize_name)
my %matching_tr;
for my $sloppiness (0 .. 4) {
    for my $crsid (reverse @phds) {
        my @matches = ();
        my $phd_author = normalize_name($forenames{$crsid}, $surname{$crsid},
                                        $sloppiness);
        $phd_author = $rename{$phd_author} if exists $rename{$phd_author};
        my $phd_title = $title{$crsid};

        # reports that already found their student are out of the race
        @trs = grep { !defined $_->{crsid} } @trs;

        for my $report (@trs) {
            my $tr_author = normalize_name($report->{authors}->[0]->forenames,
                                           $report->{authors}->[0]->surname,
                                           $sloppiness);
            if ($sloppiness == 0) {
                my $tr_title = $report->{'title'};
                #print STDERR "$sloppiness: $phd_title - $tr_title\n" if
                #    $phd_author =~ /Moore/i && $tr_author =~ /Moore/i;
                next unless $phd_title eq $tr_title;
            }
            #print STDERR "$sloppiness: $phd_author - $tr_author\n" if
            #    $phd_author =~ /emens/i && $tr_author =~ /emens/i;
            next unless $phd_author eq $tr_author;
            # found a match
            push @matches, $report->{'nr'};
        }

        if (@matches) {
            $matching_tr{$crsid} = $matches[0];
            $tr{$matches[0]}->{'crsid'} = $crsid;
            warn("Multiple equal-ranking TR matches for PhD of $crsid: " .
                 join(', ', @matches) . "\n") if @matches > 1;
        }
    }
}

# check whether all PhD TRs have been covered
my @unmatched = grep {
    (!$_->{crsid}                           # not found in Research_Students table
     and !$_->submitted_before('1993')      # not pre-1993 (when we still had
                                            # non-CL Degree Committee PhDs)
     and !$dtg{($_->{authors}->[0]->fullname)})  # not an Engineering student
} @trs;
if (@unmatched) {
    warn("Warning: the following technical reports are listed in the data file\n\n $dbfile\n\nas being Computer Laboratory PhDs, but the script\n\n $0\n\ncan find no corresponding entry in the Administration/Research_Students table\non the departmental SQLserver:\n\n");
    foreach my $report (@unmatched) {
        warn(" - UCAM-CL-TR-" . $report->{nr} . " by " .
             join(', ', map { $_->fullname } @{$report->{authors}}) .
             " (submitted " . $report->submitted . ")\n");
    }
    warn("\nPossible reasons:\n\n - name not spelled correctly in one of the sources\n (contact $studentadmin, investigate and fix the respective data source)\n - name changed deliberately since registration\n (workaround: edit the hash table %rename in $0)\n")
}

# output HTML page: to the file named by the first argument, or to
# standard output if there is none
my $fn;
if ($fn = $ARGV[0]) {
    open(F, '>:utf8', $fn)
        or die("$0: can't open '$fn' for writing: $!\n");
} else {
    # Duplicate STDOUT explicitly: the two-argument form '>-:utf8' names
    # a file called "-:utf8" rather than standard output.
    open(F, '>&', \*STDOUT)
        or die("$0: can't write to standard output: $!\n");
    binmode(F, ':utf8');
}

# NOTE(review): the markup inside the output statements below appears to
# have been stripped from this copy of the script, and the here-document
# opener is damaged as well.  They are reproduced unchanged and must be
# restored from the repository version.
print F "\n"; print F ('\n"); print F <List of PhD theses

List of PhD theses

This is a list of all the PhD theses so far recommended by the Computer Science Degree Committee to the Board of Graduate Studies for approval (which can in some cases mean that there are still corrections to be made before final approval). Fully approved Cambridge PhDs are listed in the University Library thesis catalog.

All authors are encouraged to publish their approved thesis as a Computer Laboratory Technical Report. Where available, the table below links to that public tech-report version of the thesis and/or the local PDF version collected by $studentadmin in $local_pdfs. (The latter files are only accessible to members of the Computer Laboratory and may in some cases not yet contain any final corrections required by the examiners.)

Note: If a Computer Laboratory research student's PhD is not listed here, then it might have been dealt with by the Mathematics Degree Committee (this includes any pre-1987 Computer Laboratory PhD) or by the Engineering Degree Committee (this applies to some early ones in the Digital Technology Group).

Data source: The table below is automatically generated from the Research_Students table on the departmental SQL server, maintained by $studentadmin, to whom enquiries about these records should be addressed.
authortitlesubmittedapprovedthesis
EOT

# One table row per approved PhD, in the order of @phds.
#
# NOTE(review): the markup inside the strings below appears to have been
# stripped from this copy of the script (which leaves, for example, the
# assignment to $author without visible effect).  The strings are
# reproduced unchanged and must be restored from the repository version;
# $year and $file are kept because the lost markup may have used them.
foreach my $c (@phds) {
    next if $funds{$c} eq 'CUED';  # DTG -> Engineering Department Deg. Com.
    next if $status{$c} ne 'approved';
    my $year = uc(substr($approvaldate{$c}, 0, 1));

    # links to the available versions of the thesis
    my @thesis;
    if (exists $matching_tr{$c}) {
        push @thesis, ("TR-$matching_tr{$c}");
    }
    for my $file ("${c}_thesis.pdf") {
        if (-f "$local_pdfs/$file") {
            push @thesis, ("local PDF");
        }
    }

    my $author = "$forenames{$c} $surname{$c}";
    if (-r "/homes/$c/public_html/index.html") {
        $author = "$author";
    }
    print F "
$author" . "$title{$c}\n";
    print F "$approvaldate{$c}" . join(', ', @thesis) . "\n";
}
print F "
\n";
# buffered write errors only surface when the handle is closed
close(F) or die("$0: error while writing the thesis list: $!\n");

if (defined $fn) {
    # reformat the page with ucampas;
    my $ucampas = '/anfs/www/tools/bin/ucampas';
    $ucampas = 'ucampas' unless -x $ucampas;
    # List-form pipe open runs the command without a shell, so a file
    # name containing quotes or other shell metacharacters is passed on
    # intact.  The command's output is read and discarded.
    if (open(my $ucampas_out, '-|', $ucampas, '-q', $fn)) {
        1 while <$ucampas_out>;
        close($ucampas_out);
    } else {
        warn("$0: can't run '$ucampas': $!\n");
    }
}