#------------------------------------------------
# 1. Creating and combining data frames
#
# Things to learn:
#   data.frame, to create new data frames
#   cbind, to add columns (an alternative to the $ notation)
#   rbind, to stack data frames on top of each other
#   do.call
#   runif

df1 <- data.frame(date=1:5, price=runif(5))     # create a data frame with two columns
df2 <- data.frame(date=1:5, price=runif(5))     # runif(5) generates 5 random numbers

df <- rbind(cbind(df1, commodity='fig'),        # add a column called commodity to each data frame,
            cbind(df2, commodity='apple'))      # then stack the two on top of each other
df

dfs <- list(cbind(df1, commodity='fig'),        # But what if you have a list of data frames to stack,
            cbind(df2, commodity='apple'))      # and the list could be any length?
df <- do.call('rbind', dfs)                     # does exactly the same as rbind(dfs[[1]], dfs[[2]], ...)


#------------------------------------------------
# 2. Factors
#
# Things to learn:
#   summary
#   how factors look in a summary
#   creating factors, and specifying their levels
#   factors don't behave like strings
#   cut

summary(df)
df$commodity

x <- factor(c('blue','red','green','blue','green'), levels=c('red','green','blue','black'))
x
which(x=='green')
x[1] <- 'black'        # 'black' is one of the levels, so this assignment works
x
x[1] <- 'orange'       # 'orange' is not a level: this gives a warning and assigns NA
x
c(x,'black')           # what happens when you combine a factor with a string?

cut(df$price, breaks=c(0,.25,.5,.75,1))    # bin a numeric vector into a factor of intervals


#------------------------------------------------
# 3. Tabulation
#
# Things to learn:
#   load, to load in a binary file
#   grepl, for text parsing
#   xtabs, the formula notation, subset
#   xtabs returns a matrix-like table
#   sort

load('../Data/weblogs.Rdata')
summary(logs)          # print out a summary of all the columns
logs[1:5, 1:6]         # print out the first 5 rows, the first 6 columns

# grepl returns a vector of TRUEs and FALSEs, according to whether each element of a
# character vector matches a given regular expression. Regular expressions are a common
# feature of many languages, and they're widely used for text parsing.
logs$searched <- grepl('(google)|(bing)|(yahoo)', logs$referrer)
logs$crawled  <- grepl('(crawler)|(bot)|(slurp)', logs$browser)
logs[1:5, c('timestamp','request','url','status','searched','crawled')]
xtabs(~searched+crawled, data=logs)

# The first part of the url string is the owner of the web page.
# Whose web pages are most popular with searchers and crawlers?
logs$reqtype <- factor(ifelse(logs$searched, 'search', ifelse(logs$crawled, 'crawl', 'browse')))
logs$owner <- sub('(http:/)?/([^?/-]+).*', '\\2', logs$url)
logs[1:5, c('timestamp','url','reqtype','owner')]
x <- xtabs(~owner+reqtype, data=logs, subset=owner %in% c('damon','lu','marcus'))
x
x['damon',]
x[,'browse']

# Whose web pages are the most popular?
# Count how many requests there are for each owner of a page.
# Sort this, highest to lowest, and find the 20 most popular owners.
x <- xtabs(~owner, data=logs)
sort(x, decreasing=TRUE)[1:20]


#------------------------------------------------
# 4. Aggregation commands
#
# Things to learn:
#   split, process, reassemble, join
#   xxtabs, from package djwutils
#   merge

library(djwutils)

xxtabs(size~owner, data=logs, FUN=mean)    # Produces error messages: size is a factor, not a number.
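# Note: a factor can't be converted with as.numeric alone; that returns the internal
# level codes rather than the printed values. A small illustrative example (a toy vector,
# not from the logs data), showing why the conversion below goes via as.character:
f <- factor(c('10','200','10'))
as.numeric(f)                    # 1 2 1       -- the level codes, not what we want
as.numeric(as.character(f))      # 10 200 10   -- the actual values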
logs$size <- as.numeric(as.character(logs$size))    # Convert a factor to a number, via string

# Split by owner, and in each chunk take the size column and compute `median` on it
x <- xxtabs(size~owner, data=logs, FUN=median, subset=owner %in% c('damon','lu','marcus'))
x
as.data.frame(x)

# Combine the first 200 rows of logs with the median file size for each row's owner
logs2 <- merge(logs[1:200, c(1:6,12)], as.data.frame(x), by='owner')
logs2[1:10,]

# What are the first 10 log entries that are bigger than 10*median for that owner?
logs2[logs2$size/logs2$median > 10, ][1:10,]
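
# For comparison, the same split / process / reassemble / join pattern can be sketched in
# base R without the djwutils package, using tapply and merge. This is only a sketch: it
# assumes the size and owner columns created above, and its output layout will differ
# slightly from logs2.
med <- tapply(logs$size, logs$owner, median, na.rm=TRUE)    # split size by owner, take each chunk's median
med.df <- data.frame(owner=names(med), median=as.vector(med))
logs3 <- merge(logs[1:200, c('owner','size')], med.df, by='owner')   # join the medians back onto the rows
logs3[logs3$size/logs3$median > 10, ][1:10,]                # entries more than 10x their owner's median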