#------------------------------------------------
# 1. Creating and combining data frames
#
# Things to learn:
#   data.frame, to create new data frames
#   cbind, to add columns (an alternative to the $ notation)
#   rbind, to stack data frames on top of each other
#   do.call
#   runif

df1 <- data.frame(date=1:5, price=runif(5))     # create a data frame with two columns
df2 <- data.frame(date=1:5, price=runif(5))     # runif(5) generates 5 random numbers

df <- rbind(cbind(df1, commodity='fig'),        # add a column called commodity to each data frame,
            cbind(df2, commodity='apple'))      # then stack the two on top of each other
df

dfs <- list(cbind(df1, commodity='fig'),        # But what if you have a list of data frames to stack,
            cbind(df2, commodity='apple'))      # and the list could be any length?
df <- do.call('rbind', dfs)                     # does exactly the same as rbind(dfs[[1]], dfs[[2]], ...)


#------------------------------------------------
# 2. Factors
#
# Things to learn:
#   summary
#   how factors look in a summary
#   creating factors, and specifying their levels
#   factors don't behave like strings
#   cut

summary(df)
df$commodity

x <- factor(c('blue','red','green','blue','green'), levels=c('red','green','blue','black'))
x
which(x=='green')
x[1] <- 'black'        # 'black' is one of the levels, so this assignment works
x
x[1] <- 'orange'       # 'orange' is not a level: this gives a warning and assigns NA
x
c(x,'black')           # what happens when you combine a factor with a string?

cut(df$price, breaks=c(0,.25,.5,.75,1))    # bin a numeric vector into a factor of intervals


#------------------------------------------------
# 3. Tabulation
#
# Things to learn:
#   load, to load in a binary file
#   grepl, for text parsing
#   xtabs, the formula notation, subset
#   xtabs returns a matrix-like table
#   sort

load('../Data/weblogs.Rdata')
summary(logs)          # print out a summary of all the columns
logs[1:5, 1:6]         # print out the first 5 rows, the first 6 columns

# grepl returns a vector of TRUEs and FALSEs, according to whether each element of a
# character vector matches a given regular expression. Regular expressions are a common
# feature of many languages, and they're widely used for text parsing.
logs$searched <- grepl('(google)|(bing)|(yahoo)', logs$referrer)
logs$crawled  <- grepl('(crawler)|(bot)|(slurp)', logs$browser)
logs[1:5, c('timestamp','request','url','status','searched','crawled')]
xtabs(~searched+crawled, data=logs)

# The first part of the url string is the owner of the web page.
# Whose web pages are most popular with searchers and crawlers?
logs$reqtype <- factor(ifelse(logs$searched, 'search', ifelse(logs$crawled, 'crawl', 'browse')))
logs$owner <- sub('(http:/)?/([^?/-]+).*', '\\2', logs$url)
logs[1:5, c('timestamp','url','reqtype','owner')]
x <- xtabs(~owner+reqtype, data=logs, subset=owner %in% c('damon','lu','marcus'))
x
x['damon',]
x[,'browse']

# Whose web pages are the most popular?
# Count how many requests there are for each owner of a page.
# Sort this, highest to lowest, and find the 20 most popular owners.
x <- xtabs(~owner, data=logs)
sort(x, decreasing=TRUE)[1:20]


#------------------------------------------------
# 4. Aggregation commands
#
# Things to learn:
#   split, process, reassemble, join
#   xxtabs, from package djwutils
#   merge

library(djwutils)

xxtabs(size~owner, data=logs, FUN=mean)    # Produces error messages: size is a factor, not a number.
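# Note: a factor can't be converted with as.numeric alone; that returns the internal
# level codes rather than the printed values. A small illustrative example (a toy vector,
# not from the logs data), showing why the conversion below goes via as.character:
f <- factor(c('10','200','10'))
as.numeric(f)                    # 1 2 1       -- the level codes, not what we want
as.numeric(as.character(f))      # 10 200 10   -- the actual values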
logs$size <- as.numeric(as.character(logs$size))    # Convert a factor to a number, via string

# Split by owner, and in each chunk take the size column and compute `median` on it
x <- xxtabs(size~owner, data=logs, FUN=median, subset=owner %in% c('damon','lu','marcus'))
x
as.data.frame(x)

# Combine the first 200 rows of logs with the median file size for each row's owner
logs2 <- merge(logs[1:200, c(1:6,12)], as.data.frame(x), by='owner')
logs2[1:10,]

# What are the first 10 log entries that are bigger than 10*median for that owner?
logs2[logs2$size/logs2$median > 10, ][1:10,]
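
# For comparison, the same split / process / reassemble / join pattern can be sketched in
# base R without the djwutils package, using tapply and merge. This is only a sketch: it
# assumes the size and owner columns created above, and its output layout will differ
# slightly from logs2.
med <- tapply(logs$size, logs$owner, median, na.rm=TRUE)    # split size by owner, take each chunk's median
med.df <- data.frame(owner=names(med), median=as.vector(med))
logs3 <- merge(logs[1:200, c('owner','size')], med.df, by='owner')   # join the medians back onto the rows
logs3[logs3$size/logs3$median > 10, ][1:10,]                # entries more than 10x their owner's median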