#------------------------------------------------------------------------------------ # NOTE. This is a semi-live transcript. It has many blind alleys and backtracks. # I wouldn't normally keep everything here; I'd edit it as I go along, and be left # with a single polished script. You can see the polished script at # the end, section 5. #------------------------------------------------------------------------------------ # PROBLEM: the interactive whiteboard program CHALK is very slow at synchronizing # a new device. Why might this be? # # Obviously we need to know a bit about how CHALK works. There is a # network of nodes, each node running an instance of CHALK. Each node # wants to get the entire contents of the whiteboard. It does this in # two ways. (1) when you write something, it is sent immediately as a # 'live' message to everyone. (2) Periodically, a node will send an # IHAVE message to one of its neighbours with a checksum of what it # has, and if the neighbour has something different it will reply with # IHAVE, and so on, until they work out what to send. Once a node # works out what to send, it sends a GOSSIPSTART message, then a # number of GOSSIP messages, then a final IHAVE message. # # Every message in CHALK is sent over UDP (i.e. without # reliability). It relies on the fact that each node has multiple # neighbours, and each neighbour periodically initiates IHAVEs, to # achieve reliability. Messages are sent in packets; if the message is # longer than 1024 bytes it is fragmented into multiple packets. # # I have collected detailed log files from CHALK. These log files # record all messages sent and received. Each message has an ID, and # if the message was sent in reply to an earlier message then it also # contains the ID of the message it is replying to. The packet # contains these IDs. #------------------------------------------------------------------------------------ # 1. Data import and cleaning. # Load in the raw data from one of the log files x <- readLines('log-chalk-sakoa.txt') # Have a look at typical rows. x[1:10] # The output is too long. Truncate it. substr(x[1:50],1,80) # The log lines that record packets sent/received all have the form # (time) [id]->hostname: [id>replyto]MSG(args) # (time) Received from hostname: [id>replyto]MSG(args) # There are plenty of other log lines recording the state, but we'll # only look at messages for now. # I want to extract all the packets sent & received, and I want to put # them into a data frame with separate columns for time, id, replyto, # msg, args. # Pick out the pkt-logging lines, which begin with "(timestamp)" xt <- grep('\\([0-9.]+\\)',x, value=TRUE) # Work out which are "Received from" lines. # Split apart the "Received from" lines, and the sending lines. r <- grep('Received from',xt) xt <- gsub('\\(([0-9.]+)\\) Received from ([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)', '\\1---\\2---\\3---\\4---\\5---\\6', xt) xt <- gsub('\\(([0-9.]+)\\) \\[[-0-9]+\\]->([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)', '\\1---\\2---\\3---\\4---\\5---\\6', xt) xt <- strsplit(xt,'---') log <- data.frame(time=sapply(xt,'[',1), host=sapply(xt,'[',2), id=sapply(xt,'[',3), repl=sapply(xt,'[',4), msg=sapply(xt,'[',5), pars=sapply(xt,'[',6)) # r is a vector of integers, indexing which lines are received # packets. I want a column called dir with 'in' or 'out'. log$dir <- 'out' log$dir[r] <- 'in' log$dir <- factor(log$dir) # I'll probably want to tabulate by this # later, so make it a factor. # Check if the data frame is sensible. summary(log) # The pars column has silly long strings. summary is showing me some # of these silly long strings because it things the column is a # factor. I'll mark it a character. log$pars <- as.character(log$pars) summary(log) # It's treated time as a factor. (This is because time came from a # grep, which returns character vectors; data.frame by default turns character # into factor.) Fix it to be a number. # (Remember to convert to character then to numeric -- otherwise we'd # just get the factor codes, not the numerical equivalents.) log$time <- as.numeric(as.character(log$time)) summary(log) # Now it all looks nice. # We might as well do all this data cleaning in a single step. # I marked the levels of dir explicitly, so it knows that 'in' is a # valid level of the factor, because otherwise I'd get an error when I # try to assign log$dir[r] <- 'in'. Also, the command I(.) means "keep # this column as is", i.e. as a character vector. log <- data.frame(time=as.numeric(sapply(xt,'[',1)), host=sapply(xt,'[',2), dir=factor('out',levels=c('in','out')), id=sapply(xt,'[',3), repl=sapply(xt,'[',4), msg=sapply(xt,'[',5), pars=I(sapply(xt,'[',6))) log$dir[r] <- 'in' #------------------------------------------------------------------------------------ # 2. Initial plots/exploration # Quick inspection of the data. log[1:5,] # The output is too long to read. The column 'pars' is the problem -- # it's too long. Work out which column it is, and suppress it. names(log) # pars is in column 5 log[1:15,-6] # Now I want to plot some simple basic diagrams, to see what's going # on. # Putting 1:nrow(log) on the y-axis simply mneans that each point goes # on its own line. library(lattice) xyplot(1:nrow(log)~time, groups=msg, data=log) # We see it goes up for a burst, then plateaus. The plateaus must be # the time-outs that are killing performance. # Let's look more closely at each type of message. I want to plot # 'cumulative total number of packets of a given type' on the y # axis. Rather than explicitly splitting the data frame myself and # calculating cumulative totals, it's easier to do this within the # plot command. If I use panel=panel.superpose and then define an # extra argument panel.groups, I can customize how each group is # plotted. xyplot(1~time, groups=msg, data=log, panel=panel.superpose, panel.groups = function(x,y,...) panel.xyplot(x,1:length(x),...) ) # The plot looks daft. The problem is that the formula 1~time makes # xyplot think that the range of y-values is [1,1]. I need to override # its choice of y axis. I could do this elegantly, but I'll just do it # as an explicit kludge. xyplot(1~time, groups=msg, data=log, ylim=c(0,3000), panel=panel.superpose, panel.groups = function(x,y,...) panel.xyplot(x,1:length(x),...) ) # I need a key, to see which points are which. xyplot(1~time, groups=msg, data=log, ylim=c(0,3000), panel=panel.superpose, panel.groups = function(x,y,...) panel.xyplot(x,1:length(x),...), auto.key=TRUE ) # It's the GOSSIP messages that clearly show the punctuated bursts. # Let's just plot the GOSSIP messages. Also, remember they can come in # two directions, inbound and outbound. xyplot(1~time|dir, data=log, subset=msg=='GOSSIP', ylim=c(0,1800), panel=function(x,y,...) panel.xyplot(x,1:length(x),...) ) # It's not plotting anything! Looking at the plot, we see it has # chosen a stupid x-axis. It must be a silliness because of the # formula I have it, 1~time. I guess it just wants some variability in # the y values. Let's just force it to use a decent x axis. xyplot(1~time|dir, data=log, subset=msg=='GOSSIP', ylim=c(0,1800), xlim=range(log$time), panel=function(x,y,...) panel.xyplot(x,1:length(x),...) ) # It's obviously the case that whatever problems we're having should # be visible in the incoming GOSSIP messages. I want to work out why # they have these plateaux (timeouts). # Now is a good time to actually inspect one of times when things are # going wrong. Let's first of all zoom in on the graph, so we know # what to look for. xyplot(1~time, data=log, subset=msg=='GOSSIP' & dir=='in', ylim=c(0,400), xlim=c(16,22), panel=function(x,y,...) panel.xyplot(x,1:length(x),...) ) log[log$time>16 & log$time<22,-6] #------------------------------------------------------------------------------------ # 3. Re-do data import # After getting a too-long listing, I think it would be helpful to # inspect the raw log file at this timepoint, to figure out what's # going on. It would be good to keep the original line number, so I # can easily cross-reference a weirdness in the dataframe against the # original log file. # Let me rewrite the import routine. x <- readLines('log-chalk-sakoa.txt') xtn <- grep('\\([0-9.]+\\)',x) # keep track of the line numbers that # refer to packet send/recv. xt <- x[xtn] r <- grep('Received from',xt) xt <- gsub('\\(([0-9.]+)\\) Received from ([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)', '\\1---\\2---\\3---\\4---\\5---\\6', xt) xt <- gsub('\\(([0-9.]+)\\) \\[[-0-9]+\\]->([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)', '\\1---\\2---\\3---\\4---\\5---\\6', xt) xt <- strsplit(xt,'---') log <- data.frame(linenum=xtn, # add in line numbers to the log data frame time=as.numeric(sapply(xt,'[',1)), host=sapply(xt,'[',2), dir=factor('out', levels=c('in','out')), id=sapply(xt,'[',3), repl=sapply(xt,'[',4), msg=sapply(xt,'[',5), pars=I(sapply(xt,'[',6))) log$dir[r] <- 'in' # Let's look again for a plateau, this time keeping track of the line # numbers. I'm only looking at incoming GOSSIP messages for the time # being, so I'll create a new data frame with just these. That way I # don't have to mess around with customizing my panel function, for # graphics. gin <- log[log$dir=='in' & log$msg=='GOSSIP',] gin[1:15,] # Find a plateau. xyplot(linenum~time, data=gin) xyplot(linenum~time, data=gin, xlim=c(16,22), ylim=c(0,1300) ) # The plateau is happening from time ~17 to ~21, at line number ~840. # Let's look more carefully at the log file around line 840. substr(x[800:900], 1,100) # I want to print it with the actual line number. paste(1:length(x), substr(x,1,100))[800:900] # Looking around line 830, I see two things: # (1) there are other messages going on during the plateau, e.g. PING # (2) there are lots of log lines that say 'Message already in store' # or 'message fragment already received'. This suggests a more # systematic failure of communication, of which the plateaux are just # a symptom. # Let's see how many of these defective communications there are, # i.e. how freqently the log file has these 'already' lines grep('already',x,value=TRUE)[1:20] # Tabulate: how many lines say 'already'. xtabs(~grepl('already',x)) # A sizeable fraction (814 out of a total of 5841+814) are 'already' # lines. There must be a huge anount of wasted communication going on here! # Let's investiage further, # Looking at the log file, the 'already' lines come immediately after # receipt of a GOSSIP message. Therefore I can work out which GOSSIP # messages were followed by an 'already' line, i.e. which GOSSIP # messages are duplicates of something that came earlier, with the following code. log$dup <- (log$linenum+1) %in% grep('already',x) # Let's see how many messages are dups. xtabs(~dup+msg, data=log, subset= dir=='in') # Shocking! More than half of all GOSSIP messages are duplicates. None # of the other messages are duplicates. # Let's look at these duplicate GOSSIP messages, and see how frequent # they are, and how they're spread across the entire file. I'll plot # cumulative number of duplicated GOSSIP on the y-axis, against # cumulative number of GOSSIP on the x-axis. df <- log[log$msg=='GOSSIP' & log$dir=='in',] xyplot(cumsum(ifelse(dup,1,0))~1:nrow(df), data=df) # It's a roughly straight line. In other words, drops are happening # throughout the session, at a roughly constant rate. #------------------------------------------------------------------------------------ # 4. Detective work: inferring causality # So far we have discovered correlations, i.e. patterns in the # data. We've seen that there are frequent drops throughout the entire # session, and that there are timeout plateaux. # # Now for some detective work. Let's look at one drop in detail, and # try to uncover the sequence of events that caused it. # Show the first five drops log[log$dup,][1:5,] # Show the events that led up to the first few drops log[log$linenum<=60,] # As usual, output is too long, because of the pars column. Suppress it. log[log$linenum<=60,-8] # There is a bunch of GOSSIP duplicates in the "flight" of packets # that are in reply to message number 3. Let's have a look at this # flight in more detail. log[(log$dir=='in' & log$repl==3) | (log$dir=='out' & log$id==3),-8] # Maybe this was a badly chosen flight, since all the GOSSIP messages # are duplicated. Obviously the IHAVE I sent on linenum=18 was a bad IHAVE, # because it triggered the other end to send me a load of stuff I have # already. # Let's look at some more of these 'conversation threads'. A # conversation thread consists of some GOSSIPCOMING, GOSSIP and IHAVE # messages. Also, the message I sent that triggered the other side to # send me these. thread <- log$dir=='in' & (log$msg %begins% c('GOSSIP','IHAVE')) thread <- thread | (log$dir=='out' & log$id %in% log$repl[thread]) log[thread & log$linenum<=150, -8] # Two problems. # (1) I think I would like to see a little bit more of the message # contents. # (2) The messages are ordered by time, but it might be easier to see # them ordered by conversation thread. log$par2 <- ifelse(nchar(log$pars)<=60,log$pars,paste(substr(log$pars,1,57),'...')) # I want to order them by "flight number", i.e. msg ID for outgoing, # repl ID for incoming; and then within each flight I want to order # them by time. log <- log[order(ifelse(log$dir=='out',log$id,log$repl),log$time),] # Inspect the output log[1:20,-8] # This isn't what I asked for! Why hasn't it ordered them how I want? # Let's look at log to find out. summary(log) # Ah. id and repl are factors, not numbers. When you sort by a factor, # you sort by the levels of the factor, and R might have chosen any # which levels for these two factors. Solution: use numbers, not # factors, for packet IDs. # WRONG! BAD! NEVER TURN A FACTOR INTO A NUMBER BY as.numeric, SINCE # THAT GIVES YOU THE FACTOR CODE, NOT THE VALUE OF THE FACTOR LEVEL. # log$id <- as.numeric(log$id) # log$repl <- as.numeric(log$repl) log$id <- as.numeric(as.character(log$id)) log$repl <- as.numeric(as.character(log$repl)) # Now we can re-order the data and inspect it. log <- log[order(ifelse(log$dir=='out',log$id,log$repl),log$time),] log[1:20,-8] # This is showing me lots of outbound messages with id=-1. # These are in fact the GOSSIP messages that I'm sening out. But I # have chosen to investigate the GOSSIP messages coming in, so let's # ignore those. log[log$dir!='out' | log$id>=0,][1:50,-8] # Ah! I see! There are flights where say 7 fragments are promised and # fewer than 7 are delivered. These "bad" flights have duplicate # GOSSIP messages in the next flights. # Let's therefore look at flight, i.e. GOSSIPCOMING, GOSSIP and IHAVE, # all in reply to the same message. How much of the flight arrives? # Does it match the promise in GOSSIPCOMING? Is there even a # GOSSIPCOMING? Is there an IHAVE? # I want to create several data frames, one for each of these pieces # of information (numfrags promised, number of GOSSIPs, presence of # IHAVE). I'll also add one for the flight starttime, in case it's # useful later. I will then merge all these data frames together, so I # end up with a dataframe called flights, with one row per flight. # Extract all GOSSIPCOMING messages, and parse the pars column f1 <- log[log$dir=='in' & log$msg=='GOSSIPCOMING',c('repl','pars')] f1$numfrags <- as.numeric(substr(f1$pars,nchar('(numfrags=.'),nchar(f1$pars)-1)) f1$pars <- NULL # Tabulate how many GOSSIP messages there are for each flight f2 <- xtabs(~repl, data=log, subset=dir=='in' & msg=='GOSSIP') f2 <- data.frame(repl=as.numeric(names(f2)),numgossip=as.vector(f2)) # Extract all IHAVE messages, and put reply ID in one column, TRUE in # the other. (reply IDs without an IHAVE will not appear in this data # frame; they'll come in later when we merge.) f3 <- log[log$dir=='in' & log$msg=='IHAVE' & !is.na(log$repl),'repl'] f3 <- data.frame(repl=f3,ihave=TRUE) # For each flight, find the minimum timestamp library(djwutils) f4 <- xxtabs(time~repl, data=log, FUN=min, subset=dir=='in' & msg %in% c('GOSSIPCOMING','GOSSIP','IHAVE') & !is.na(log$repl)) f4 <- as.data.frame(f4) # Merge all of these together. Merge by the 'repl' column, i.e. the # flight number. Use the option all=TRUE so that even if one of these # data frames doesn't have the flight number (e.g. if a flight is # missing its IHAVE, then it won't appear in f4), the row will still # appear in our final list. It'll have an NA; fix up the NA. flights <- merge(f1,f2,by='repl',all=TRUE) flights <- merge(flights,f3,by='repl',all=TRUE) flights <- merge(flights,f4,by='repl',all=TRUE) flights$ihave <- !is.na(flights$ihave) # Inspect the outcome. flights$bad <- is.na(flights$numfrags) | flights$numgossip<flights$numfrags | !flights$ihave flights[1:25,] # We can see that there are many bad flights, i.e. flights with fragments which # aren't getting through. This is the basic problem. # Also, our plateau from 16 to 21: it's probably because the IHAVE # didn't get through, so there was no reply, so we had to timeout. # But those plateaux are just a sympton, the real problem is with the # drops themselves. #------------------------------------------------------------------------------------ # 5. Going meta. # We have learnt that many flights aren't getting through properly, # This means the flight has to be retransmitted, either fully or in # part, and it can lead to timeouts if the final IHAVE is dropped. # Why are so many packets being dropped? The overall data rate seems # fairly small. df <- log[log$dir=='in',] nrow(df)/max(df$time) # average of 17 packets per second. nrow(df)/max(df$time) * 1500 * 8 / 1000 # 203 kb/s # At only 203 kbit/s, surely there shouldn't be this much loss. # I'm running this experiment on a wired LAN at UCL, after all. # We need to try to learn about why there are so many drops. There # aren't enough flights in a single log file for me to learn much, and # it'd be good to study a range of different settings (office, home, # localhost). Let's wrap up all the derivation in a single # function, so we can apply it to a collection of log files. # This function returns list(log,flights). Both may be useful, so we # might as well keep them. getflights <- function(x) { xtn <- grep('\\([0-9.]+\\)',x) xt <- x[xtn] r <- grep('Received from',xt) xt <- gsub('\\(([0-9.]+)\\) Received from ([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)', '\\1---\\2---\\3---\\4---\\5---\\6', xt) xt <- gsub('\\(([0-9.]+)\\) \\[[-0-9]+\\]->([^:]+): \\[([-0-9]+)>?([-0-9]+)?\\]([[:alpha:]]+)(.*)', '\\1---\\2---\\3---\\4---\\5---\\6', xt) xt <- strsplit(xt,'---') log <- data.frame(linenum=xtn, time=as.numeric(sapply(xt,'[',1)), host=sapply(xt,'[',2), dir=factor('out', levels=c('in','out')), id=as.numeric(sapply(xt,'[',3)), repl=as.numeric(sapply(xt,'[',4)), msg=sapply(xt,'[',5), pars=I(sapply(xt,'[',6))) log$dir[r] <- 'in' # f1 <- log[log$dir=='in' & log$msg=='GOSSIPCOMING',c('repl','pars')] f1$numfrags <- as.numeric(substr(f1$pars,nchar('(numfrags=.'),nchar(f1$pars)-1)) f1$pars <- NULL f2 <- xtabs(~repl, data=log, subset=dir=='in' & msg=='GOSSIP') f2 <- data.frame(repl=as.numeric(names(f2)),numgossip=as.vector(f2)) f3 <- log[log$dir=='in' & log$msg=='IHAVE' & !is.na(log$repl),'repl'] f3 <- data.frame(repl=f3,ihave=TRUE) f4 <- xxtabs(time~repl, data=log, FUN=list(time=min), subset=dir=='in' & msg %in% c('GOSSIPCOMING','GOSSIP','IHAVE') & !is.na(log$repl)) f4 <- as.data.frame(f4) flights <- merge(f1,f2,by='repl',all=TRUE) flights <- merge(flights,f3,by='repl',all=TRUE) flights <- merge(flights,f4,by='repl',all=TRUE) flights$ihave <- !is.na(flights$ihave) flights$numgossip[is.na(flights$numgossip)] <- 0 # list(log=log,flights=flights) } # Let's look first at overall data rate (kbit/s) for each of the log files. lognames <- grep('^log-chalk-.*\\.txt$',dir(),value=TRUE) y <- lapply(lognames, function(nn) cbind(getflights(readLines(nn))$log,file=nn)) y <- do.call('rbind',y) as.data.frame(xxtabs(time~file, data=y, subset=dir=='in', FUN=list(pkts=length,time=max,rate=function(x) (length(x)-1)/(max(x)-min(x)) *1500*8/1000))) # The rates aren't very high. Even on a purely local transaction # (calur), the data rate is 92 kbit/s, which is tiny. So I'm # definitely not overwhelming the Internet! # Let's look at number of bad flights. lognames <- grep('^log-chalk-.*\\.txt$',dir(),value=TRUE) x <- lapply(lognames, function(nn) cbind(getflights(readLines(nn))$flights,file=nn)) x <- do.call('rbind',x) x$bad <- is.na(x$numfrags) | x$numgossip<x$numfrags | !x$ihave # I wonder if the problem of bad flights has anything to do with # flight length? histogram(~numfrags|bad, data=x, subset=!is.na(numfrags)) # YES! # A stacked histogram would be nice. A quick Internet search gave me # code, which I pasted into utils.R. source('utils.R') # stacked histogram histogram(~numfrags, groups=bad, data=x, subset=!is.na(numfrags), panel=panel.grouped.histogram, auto.key=TRUE) # Clearly, longer flights tend to be bad, and shorter flights have a # good chance of getting through. # Let's see how this differs between log files. # (And smarten up the legend.) histogram(~numfrags|file, groups=factor(ifelse(bad,'failed','ok'),levels=c('ok','failed')), data=x, subset=!is.na(numfrags), panel=panel.grouped.histogram, auto.key=TRUE) # Let's look with more granularity at small flights. histogram(~numfrags|file, groups=factor(ifelse(bad,'failed','ok'),levels=c('ok','failed')), data=x, subset=!is.na(numfrags) & numfrags<=50, breaks=seq(0,50,by=5), panel=panel.grouped.histogram, auto.key=TRUE) # CONCLUSION. # # Communication goes in "flights", consisting of GOSSIPCOMING, several # GOSSIP, and IHAVE. If some of those packets are dropped, the flight # is bad and has to be retransmitted. If the IHAVE is dropped, there # is a timeout. # # The overall data rates are low, so we do not expect the packet drop # rate to be high. However, we see that longer flights are much more # likely to be dropped. This suggests that buffers are small, and the # packets in a flight are all injected into the buffer at the same time. # I estimate that available buffer is approx. 15 packets, since # flights of size <=15 are likely to get through. # SOLUTION # Add congestion control, or pace packet transmission.