#------------------------------------------------------------------------------------

# NOTE. This is a semi-live transcript. It has many blind alleys and backtracks.
# I wouldn't normally keep everything here; I'd edit it as I go along, and be left
# with a single polished script. You can see the polished script at
# the end, section 5.

#------------------------------------------------------------------------------------

# PROBLEM: the interactive whiteboard program CHALK is very slow at synchronizing
# a new device. Why might this be?
#
# Obviously we need to know a bit about how CHALK works. There is a
# network of nodes, each node running an instance of CHALK. Each node
# wants to get the entire contents of the whiteboard. It does this in
# two ways. (1) when you write something, it is sent immediately as a
# 'live' message to everyone. (2) Periodically, a node will send an
# IHAVE message to one of its neighbours with a checksum of what it
# has, and if the neighbour has something different it will reply with
# IHAVE, and so on, until they work out what to send. Once a node
# works out what to send, it sends a GOSSIPCOMING message, then a
# number of GOSSIP messages, then a final IHAVE message.
#
# Every message in CHALK is sent over UDP (i.e. without
# reliability). It relies on the fact that each node has multiple
# neighbours, and each neighbour periodically initiates IHAVEs, to
# achieve reliability. Messages are sent in packets; if the message is
# longer than 1024 bytes it is fragmented into multiple packets.
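#
# Schematically, one round of synchronization between nodes A and B
# looks roughly like this (arguments omitted; the details are my
# reading of the description above):
#   A -> B : IHAVE         checksum of what A has
#   B -> A : IHAVE         checksums differ; keep exchanging until B
#                          works out what A is missing
#   B -> A : GOSSIPCOMING  announces a batch ("flight") of fragments
#   B -> A : GOSSIP        the fragments themselves, one per packet
#   ...
#   B -> A : IHAVE         marks the end of the flight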
#
# I have collected detailed log files from CHALK. These log files
# record all messages sent and received. Each message has an ID, and
# if the message was sent in reply to an earlier message then it also
# contains the ID of the message it is replying to. The packet
# contains these IDs.


#------------------------------------------------------------------------------------
# 1. Data import and cleaning.

# Load in the raw data from one of the log files
x <- readLines('log-chalk-sakoa.txt')
# Have a look at typical rows.
x[1:10]

# The output is too long. Truncate it.
substr(x[1:50],1,80)

# The log lines that record packets sent/received all have the form
#    (time) [id]->hostname: [id>replyto]MSG(args)
#    (time) Received from hostname: [id>replyto]MSG(args)
# There are plenty of other log lines recording the state, but we'll
# only look at messages for now.

# I want to extract all the packets sent & received, and I want to put
# them into a data frame with separate columns for time, id, replyto,
# msg, args.
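
# Before writing the real parsing code, here is the idea on a made-up
# line in the same shape (the hostname and IDs are invented, purely for
# illustration): capture the fields with a regex, and glue them back
# together with '---' as a separator that we can split on later.
demo <- '(12.3456) Received from examplehost: [107>95]GOSSIP(...)'
gsub('\\(([0-9.]+)\\) Received from ([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)',
     '\\1---\\2---\\3---\\4---\\5---\\6', demo)
# "12.3456---examplehost---107---95---GOSSIP---(...)"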

# Pick out the pkt-logging lines, which begin with "(timestamp)"
xt <- grep('\\([0-9.]+\\)',x, value=TRUE)
# Work out which are "Received from" lines.
# Split apart the "Received from" lines, and the sending lines.
r <- grep('Received from',xt)
xt <-
  gsub('\\(([0-9.]+)\\) Received from ([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)',
       '\\1---\\2---\\3---\\4---\\5---\\6', xt)
xt <-
  gsub('\\(([0-9.]+)\\) \\[[-0-9]+\\]->([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)',
       '\\1---\\2---\\3---\\4---\\5---\\6', xt)
xt <- strsplit(xt,'---')
log <- data.frame(time=sapply(xt,'[',1),
                  host=sapply(xt,'[',2),
                  id=sapply(xt,'[',3), repl=sapply(xt,'[',4),
                  msg=sapply(xt,'[',5), pars=sapply(xt,'[',6))
# r is a vector of integers, indexing which lines are received
# packets. I want a column called dir with 'in' or 'out'.
log$dir <- 'out'
log$dir[r] <- 'in'
log$dir <- factor(log$dir) # I'll probably want to tabulate by this
                           # later, so make it a factor.

# Check if the data frame is sensible.
summary(log)

# The pars column has silly long strings. summary is showing me some
# of these silly long strings because it thinks the column is a
# factor. I'll mark it as character.

log$pars <- as.character(log$pars)
summary(log)

# It has treated time as a factor. (This is because time came from
# splitting up character strings, and data.frame by default turns
# character columns into factors, at least in R before 4.0.) Fix it to
# be a number.
# (Remember to convert to character and then to numeric -- otherwise
# we'd just get the internal factor codes, not the values of the levels.)
log$time <- as.numeric(as.character(log$time))
summary(log)

# Now it all looks nice.

# We might as well do all this data cleaning in a single step.
# I set the levels of dir explicitly, so that 'in' is a valid level of
# the factor; otherwise the assignment log$dir[r] <- 'in' below would
# produce a warning and NAs. Also, wrapping a column in I(.) means
# "keep this column as is", i.e. as a character vector.
log <- data.frame(time=as.numeric(sapply(xt,'[',1)),
                  host=sapply(xt,'[',2), dir=factor('out',levels=c('in','out')),
                  id=sapply(xt,'[',3), repl=sapply(xt,'[',4),
                  msg=sapply(xt,'[',5), pars=I(sapply(xt,'[',6)))
log$dir[r] <- 'in'
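
# (A toy aside, nothing to do with the log, showing why the explicit
# levels matter: assigning a value that is not one of a factor's
# levels produces a warning and an NA.)
f <- factor(c('out','out'))                       # levels: just 'out'
f[1] <- 'in'                                      # warning; f[1] becomes NA
f <- factor(c('out','out'), levels=c('in','out'))
f[1] <- 'in'                                      # fine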


#------------------------------------------------------------------------------------
# 2. Initial plots/exploration

# Quick inspection of the data.
log[1:5,]

# The output is too long to read. The column 'pars' is the problem --
# it's too long. Work out which column it is, and suppress it.
names(log) # pars is in column 7
log[1:15,-7]


# Now I want to plot some simple basic diagrams, to see what's going
# on.

# Putting 1:nrow(log) on the y-axis simply means that each point goes
# on its own line.
library(lattice)
xyplot(1:nrow(log)~time, groups=msg, data=log)

# We see it goes up for a burst, then plateaus. The plateaus must be
# the time-outs that are killing performance.

# Let's look more closely at each type of message. I want to plot
# 'cumulative total number of packets of a given type' on the y
# axis. Rather than explicitly splitting the data frame myself and
# calculating cumulative totals, it's easier to do this within the
# plot command. If I use panel=panel.superpose and then define an
# extra argument panel.groups, I can customize how each group is
# plotted.

xyplot(1~time, groups=msg, data=log,
       panel=panel.superpose,
       panel.groups = function(x,y,...) panel.xyplot(x,1:length(x),...)
       )

# The plot looks daft. The problem is that the formula 1~time makes
# xyplot think that the range of y-values is [1,1]. I need to override
# its choice of y axis. I could do this elegantly, but I'll just do it
# as an explicit kludge. 

xyplot(1~time, groups=msg, data=log,
       ylim=c(0,3000),
       panel=panel.superpose,
       panel.groups = function(x,y,...) panel.xyplot(x,1:length(x),...)
       )
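
# (A less hard-coded alternative would be to compute the limit from the
# data, e.g. ylim=c(0, max(table(log$msg))), since the largest message
# group determines how high the cumulative counts can go.)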

# I need a key, to see which points are which.

xyplot(1~time, groups=msg, data=log,
       ylim=c(0,3000),
       panel=panel.superpose,
       panel.groups = function(x,y,...) panel.xyplot(x,1:length(x),...),
       auto.key=TRUE
       )

# It's the GOSSIP messages that clearly show the punctuated bursts.
# Let's just plot the GOSSIP messages. Also, remember they can come in
# two directions, inbound and outbound.

xyplot(1~time|dir, data=log, subset=msg=='GOSSIP',
       ylim=c(0,1800),
       panel=function(x,y,...) panel.xyplot(x,1:length(x),...)
       )

# It's not plotting any points! Looking at the axes, we see it has
# chosen a silly x-axis range. This must be some quirk of the formula
# I gave it, 1~time; I guess it wants some variability in the y
# values. Let's just force it to use a decent x axis.

xyplot(1~time|dir, data=log, subset=msg=='GOSSIP',
       ylim=c(0,1800), xlim=range(log$time),
       panel=function(x,y,...) panel.xyplot(x,1:length(x),...)
       )


# It's obviously the case that whatever problems we're having should
# be visible in the incoming GOSSIP messages. I want to work out why
# they have these plateaux (timeouts).

# Now is a good time to inspect one of the times when things are going
# wrong. Let's first zoom in on the graph, so we know what to look
# for.

xyplot(1~time, data=log, subset=msg=='GOSSIP' & dir=='in',
       ylim=c(0,400), xlim=c(16,22),
       panel=function(x,y,...) panel.xyplot(x,1:length(x),...)
       )

log[log$time>16 & log$time<22,-7]


#------------------------------------------------------------------------------------
# 3. Re-do data import

# After getting a too-long listing, I think it would be helpful to
# inspect the raw log file at this timepoint, to figure out what's
# going on. It would be good to keep the original line number, so I
# can easily cross-reference a weirdness in the dataframe against the
# original log file.

# Let me rewrite the import routine.
x <- readLines('log-chalk-sakoa.txt')
xtn <- grep('\\([0-9.]+\\)',x)  # keep track of the line numbers that
                                # refer to packet send/recv.
xt <- x[xtn]
r <- grep('Received from',xt)
xt <-
  gsub('\\(([0-9.]+)\\) Received from ([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)',
       '\\1---\\2---\\3---\\4---\\5---\\6', xt)
xt <-
  gsub('\\(([0-9.]+)\\) \\[[-0-9]+\\]->([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)',
       '\\1---\\2---\\3---\\4---\\5---\\6', xt)
xt <- strsplit(xt,'---')
log <- data.frame(linenum=xtn, # add in line numbers to the log data frame
                  time=as.numeric(sapply(xt,'[',1)),
                  host=sapply(xt,'[',2), dir=factor('out', levels=c('in','out')),
                  id=sapply(xt,'[',3), repl=sapply(xt,'[',4),
                  msg=sapply(xt,'[',5), pars=I(sapply(xt,'[',6)))
log$dir[r] <- 'in'


# Let's look again for a plateau, this time keeping track of the line
# numbers. I'm only looking at incoming GOSSIP messages for the time
# being, so I'll create a new data frame with just these. That way I
# don't have to mess around with customizing my panel function, for
# graphics.

gin <- log[log$dir=='in' & log$msg=='GOSSIP',]
gin[1:15,]

# Find a plateau.
xyplot(linenum~time, data=gin)

xyplot(linenum~time, data=gin,
       xlim=c(16,22), ylim=c(0,1300) )

# The plateau is happening from time ~17 to ~21, at line number ~840.
# Let's look more carefully at the log file around line 840.

substr(x[800:900], 1,100)

# I want to print it with the actual line number.

paste(1:length(x), substr(x,1,100))[800:900]

# Looking around line 830, I see two things:
# (1) there are other messages going on during the plateau, e.g. PING
# (2) there are lots of log lines that say 'Message already in store'
# or 'message fragment already received'. This suggests a more
# systematic failure of communication, of which the plateaux are just
# a symptom.

# Let's see how many of these defective communications there are,
# i.e. how frequently the log file has these 'already' lines.

grep('already',x,value=TRUE)[1:20]

# Tabulate: how many lines say 'already'.

xtabs(~grepl('already',x))

# A sizeable fraction (814 out of a total of 5841+814) are 'already'
# lines. There must be a huge amount of wasted communication going on
# here! Let's investigate further.
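
# As a fraction of the whole file:
sum(grepl('already',x)) / length(x)   # roughly 12%, going by the counts above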

# Looking at the log file, the 'already' lines come immediately after
# receipt of a GOSSIP message. Therefore I can work out which GOSSIP
# messages were followed by an 'already' line, i.e. which GOSSIP
# messages are duplicates of something that came earlier, with the following code.

log$dup <- (log$linenum+1) %in% grep('already',x)
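
# Quick sanity check: by construction, the raw log line just after the
# first flagged message should be an 'already' line.
x[log$linenum[log$dup][1] + 1]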

# Let's see how many messages are dups.

xtabs(~dup+msg, data=log, subset= dir=='in')

# Shocking! More than half of all GOSSIP messages are duplicates. None
# of the other messages are duplicates.

# Let's look at these duplicate GOSSIP messages, and see how frequent
# they are, and how they're spread across the entire file. I'll plot
# cumulative number of duplicated GOSSIP on the y-axis, against
# cumulative number of GOSSIP on the x-axis.

df <- log[log$msg=='GOSSIP' & log$dir=='in',]
xyplot(cumsum(ifelse(dup,1,0))~1:nrow(df), data=df)

# It's a roughly straight line. In other words, drops are happening
# throughout the session, at a roughly constant rate.



#------------------------------------------------------------------------------------
# 4. Detective work: inferring causality

# So far we have discovered correlations, i.e. patterns in the
# data. We've seen that there are frequent drops throughout the entire
# session, and that there are timeout plateaux.
#
# Now for some detective work. Let's look at one drop in detail, and
# try to uncover the sequence of events that caused it.

# Show the first five drops

log[log$dup,][1:5,]

# Show the events that led up to the first few drops

log[log$linenum<=60,]

# As usual, output is too long, because of the pars column. Suppress it.
log[log$linenum<=60,-8]

# There is a bunch of GOSSIP duplicates in the "flight" of packets
# that are in reply to message number 3. Let's have a look at this
# flight in more detail.

log[(log$dir=='in' & log$repl==3) | (log$dir=='out' & log$id==3),-8]

# Maybe this was a badly chosen flight, since all the GOSSIP messages
# are duplicates. Evidently the IHAVE I sent on linenum=18 was a bad
# IHAVE, because it triggered the other end to send me a load of stuff
# I already have.

# Let's look at some more of these 'conversation threads'. A
# conversation thread consists of some GOSSIPCOMING, GOSSIP and IHAVE
# messages. Also, the message I sent that triggered the other side to
# send me these.

thread <- log$dir=='in' & grepl('^(GOSSIP|IHAVE)', log$msg)
thread <- thread | (log$dir=='out' & log$id %in% log$repl[thread])

log[thread & log$linenum<=150, -8]

# Two problems.
# (1) I think I would like to see a little bit more of the message
# contents.
# (2) The messages are ordered by time, but it might be easier to see
# them ordered by conversation thread.

log$par2 <- ifelse(nchar(log$pars)<=60,log$pars,paste(substr(log$pars,1,57),'...'))
# I want to order them by "flight number", i.e. msg ID for outgoing,
# repl ID for incoming; and then within each flight I want to order
# them by time.
log <- log[order(ifelse(log$dir=='out',log$id,log$repl),log$time),]

# Inspect the output

log[1:20,-8]

# This isn't what I asked for! Why hasn't it ordered them how I want?
# Let's look at log to find out.

summary(log)

# Ah. id and repl are factors, not numbers. When you sort by a factor,
# you sort by the order of its levels, and here the levels are ordered
# as character strings (so e.g. "10" comes before "2"), not in numeric
# order. Solution: use numbers, not factors, for packet IDs.
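
# (Toy illustration, nothing to do with the log: numbers stored as a
# factor have their levels ordered as strings, so "10" sorts before "2".)
sort(factor(c(2,10,1)))   # 1 10 2 -- string order, not numeric order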

# WRONG! BAD! NEVER TURN A FACTOR INTO A NUMBER BY as.numeric, SINCE
# THAT GIVES YOU THE FACTOR CODE, NOT THE VALUE OF THE FACTOR LEVEL.
# log$id <- as.numeric(log$id)
# log$repl <- as.numeric(log$repl)

log$id <- as.numeric(as.character(log$id))
log$repl <- as.numeric(as.character(log$repl))

# Now we can re-order the data and inspect it.
log <- log[order(ifelse(log$dir=='out',log$id,log$repl),log$time),]
log[1:20,-8]

# This is showing me lots of outbound messages with id=-1.
# These are in fact the GOSSIP messages that I'm sending out. But I
# have chosen to investigate the GOSSIP messages coming in, so let's
# ignore those.

log[log$dir!='out' | log$id>=0,][1:50,-8]


# Ah! I see! There are flights where, say, 7 fragments are promised
# and fewer than 7 are delivered. The flights that follow these "bad"
# flights then contain duplicate GOSSIP messages.

# Let's therefore look at flights, i.e. the GOSSIPCOMING, GOSSIP and
# IHAVE messages that are all in reply to the same message. How much
# of each flight arrives? Does it match the promise in GOSSIPCOMING?
# Is there even a GOSSIPCOMING? Is there an IHAVE?

# I want to create several data frames, one for each of these pieces
# of information (numfrags promised, number of GOSSIPs received,
# presence of an IHAVE). I'll also add one for the flight start time,
# in case it's useful later. I will then merge all these data frames
# together, so I end up with a data frame called flights, with one row
# per flight.

# Extract all GOSSIPCOMING messages, and parse the pars column
f1 <- log[log$dir=='in' & log$msg=='GOSSIPCOMING',c('repl','pars')]
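# The pars string here looks like '(numfrags=NN)': the number starts at
# character position nchar('(numfrags=.') = 11, and the -1 strips the
# closing bracket.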
f1$numfrags <- as.numeric(substr(f1$pars,nchar('(numfrags=.'),nchar(f1$pars)-1))
f1$pars <- NULL

# Tabulate how many GOSSIP messages there are for each flight
f2 <- xtabs(~repl, data=log, subset=dir=='in' & msg=='GOSSIP')
f2 <- data.frame(repl=as.numeric(names(f2)),numgossip=as.vector(f2))

# Extract all IHAVE messages, and put reply ID in one column, TRUE in
# the other. (reply IDs without an IHAVE will not appear in this data
# frame; they'll come in later when we merge.)
f3 <- log[log$dir=='in' & log$msg=='IHAVE' & !is.na(log$repl),'repl']
f3 <- data.frame(repl=f3,ihave=TRUE)

# For each flight, find the minimum timestamp
library(djwutils)
f4 <- xxtabs(time~repl, data=log, FUN=min, subset=dir=='in' & msg %in% c('GOSSIPCOMING','GOSSIP','IHAVE') & !is.na(log$repl))
f4 <- as.data.frame(f4)

# Merge all of these together. Merge by the 'repl' column, i.e. the
# flight number. Use the option all=TRUE so that even if one of these
# data frames doesn't have the flight number (e.g. if a flight is
# missing its IHAVE, then it won't appear in f3), the row will still
# appear in our final table. It'll have an NA; fix up the NA.
flights <- merge(f1,f2,by='repl',all=TRUE)
flights <- merge(flights,f3,by='repl',all=TRUE)
flights <- merge(flights,f4,by='repl',all=TRUE)
flights$ihave <- !is.na(flights$ihave)

# Decide which flights count as bad, and inspect the outcome.

flights$bad <- is.na(flights$numfrags) |
    flights$numgossip<flights$numfrags | !flights$ihave

flights[1:25,]


# We can see that there are many bad flights, i.e. flights with fragments which
# aren't getting through. This is the basic problem.

# Also, our plateau from time 16 to 21: it's probably because the
# IHAVE didn't get through, so there was no reply, so we had to time
# out. But those plateaux are just a symptom; the real problem is the
# drops themselves.


#------------------------------------------------------------------------------------
# 5. Going meta.

# We have learnt that many flights aren't getting through properly.
# This means the flight has to be retransmitted, either fully or in
# part, and it can lead to timeouts if the final IHAVE is dropped.

# Why are so many packets being dropped? The overall data rate seems
# fairly small.

df <- log[log$dir=='in',]
nrow(df)/max(df$time) # an average of about 17 packets per second
nrow(df)/max(df$time) * 1500 * 8 / 1000  # ~203 kbit/s, generously assuming full 1500-byte packets

# At only 203 kbit/s, surely there shouldn't be this much loss.
# I'm running this experiment on a wired LAN at UCL, after all.

# We need to try to learn about why there are so many drops. There
# aren't enough flights in a single log file for me to learn much, and
# it'd be good to study a range of different settings (office, home,
# localhost). Let's wrap up all the derivation in a single
# function, so we can apply it to a collection of log files.

# This function returns list(log,flights). Both may be useful, so we
# might as well keep them.
getflights <- function(x) {
  xtn <- grep('\\([0-9.]+\\)',x)
  xt <- x[xtn]
  r <- grep('Received from',xt)
  xt <- gsub('\\(([0-9.]+)\\) Received from ([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)', '\\1---\\2---\\3---\\4---\\5---\\6', xt)
  xt <- gsub('\\(([0-9.]+)\\) \\[[-0-9]+\\]->([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)', '\\1---\\2---\\3---\\4---\\5---\\6', xt)
  xt <- strsplit(xt,'---')
  log <- data.frame(linenum=xtn,
                    time=as.numeric(sapply(xt,'[',1)),
                    host=sapply(xt,'[',2), dir=factor('out', levels=c('in','out')),
                    id=as.numeric(sapply(xt,'[',3)), repl=as.numeric(sapply(xt,'[',4)),
                    msg=sapply(xt,'[',5), pars=I(sapply(xt,'[',6)))
  log$dir[r] <- 'in'
  #
  f1 <- log[log$dir=='in' & log$msg=='GOSSIPCOMING',c('repl','pars')]
  f1$numfrags <- as.numeric(substr(f1$pars,nchar('(numfrags=.'),nchar(f1$pars)-1))
  f1$pars <- NULL
  f2 <- xtabs(~repl, data=log, subset=dir=='in' & msg=='GOSSIP')
  f2 <- data.frame(repl=as.numeric(names(f2)),numgossip=as.vector(f2))
  f3 <- log[log$dir=='in' & log$msg=='IHAVE' & !is.na(log$repl),'repl']
  f3 <- data.frame(repl=f3,ihave=TRUE)
  f4 <- xxtabs(time~repl, data=log, FUN=list(time=min),
               subset=dir=='in' & msg %in% c('GOSSIPCOMING','GOSSIP','IHAVE') & !is.na(log$repl))
  f4 <- as.data.frame(f4)
  flights <- merge(f1,f2,by='repl',all=TRUE)
  flights <- merge(flights,f3,by='repl',all=TRUE)
  flights <- merge(flights,f4,by='repl',all=TRUE)
  flights$ihave <- !is.na(flights$ihave)
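  # Flights that delivered no GOSSIP at all would otherwise have NA
  # here; count them as zero.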
  flights$numgossip[is.na(flights$numgossip)] <- 0
  #
  list(log=log,flights=flights)
}


# Let's look first at overall data rate (kbit/s) for each of the log files.

lognames <- grep('^log-chalk-.*\\.txt$',dir(),value=TRUE)
y <- lapply(lognames, function(nn) cbind(getflights(readLines(nn))$log,file=nn))
y <- do.call('rbind',y)
as.data.frame(xxtabs(time~file, data=y, subset=dir=='in',
       FUN=list(pkts=length,time=max,rate=function(x) (length(x)-1)/(max(x)-min(x)) *1500*8/1000)))

# The rates aren't very high. Even on a purely local transaction
# (calur), the data rate is 92 kbit/s, which is tiny. So I'm
# definitely not overwhelming the Internet!


# Let's look at number of bad flights.

lognames <- grep('^log-chalk-.*\\.txt$',dir(),value=TRUE)
x <- lapply(lognames, function(nn) cbind(getflights(readLines(nn))$flights,file=nn))
x <- do.call('rbind',x)
x$bad <- is.na(x$numfrags) | x$numgossip<x$numfrags | !x$ihave

# I wonder if the problem of bad flights has anything to do with
# flight length?

histogram(~numfrags|bad, data=x, subset=!is.na(numfrags))

# YES!

# A stacked histogram would be nice. A quick Internet search gave me
# code, which I pasted into utils.R.

source('utils.R') # stacked histogram

histogram(~numfrags, groups=bad, data=x, subset=!is.na(numfrags),
          panel=panel.grouped.histogram,
          auto.key=TRUE)

# Clearly, longer flights tend to be bad, and shorter flights have a
# good chance of getting through.

# Let's see how this differs between log files.
# (And smarten up the legend.)

histogram(~numfrags|file, groups=factor(ifelse(bad,'failed','ok'),levels=c('ok','failed')),
          data=x, subset=!is.na(numfrags),
          panel=panel.grouped.histogram,
          auto.key=TRUE)

# Let's look with more granularity at small flights.

histogram(~numfrags|file, groups=factor(ifelse(bad,'failed','ok'),levels=c('ok','failed')),
          data=x, subset=!is.na(numfrags) & numfrags<=50, breaks=seq(0,50,by=5),
          panel=panel.grouped.histogram,
          auto.key=TRUE)




# CONCLUSION.
#
# Communication goes in "flights", consisting of GOSSIPCOMING, several
# GOSSIP, and IHAVE. If some of those packets are dropped, the flight
# is bad and has to be retransmitted. If the IHAVE is dropped, there
# is a timeout.
#
# The overall data rates are low, so we do not expect the packet drop
# rate to be high. However, we see that longer flights are much more
# likely to suffer drops. This suggests that buffers are small, and
# that the packets in a flight are all injected into the buffer at the
# same time. I estimate that the available buffer is approx. 15
# packets, since flights of size <=15 are likely to get through.
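
# One rough way to check the 15-packet estimate, using the flights
# table x and the 'bad' flag computed above: tabulate the fraction of
# bad flights in each flight-size band. (The break points below are my
# own arbitrary choice.)
tab <- xtabs(~bad + cut(numfrags, c(0,5,10,15,20,30,50,Inf)),
             data=x, subset=!is.na(numfrags))
round(prop.table(tab, margin=2), 2)   # columns are size bands; rows sum to 1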

# SOLUTION
# Add congestion control, or pace packet transmission.