```r
#------------------------------------------------
# 1. Creating and combining data frames
#
# Things to learn:
#    data.frame, to create new data frames
#    cbind, to add columns (an alternative to the $ notation)
#    rbind, to stack data frames on top of each other
#    do.call
#    runif

# Two toy price series: runif(n) draws n uniform random numbers on [0, 1).
df1 <- data.frame(date = 1:5, price = runif(5))
df2 <- data.frame(date = 1:5, price = runif(5))

# Tag each data frame with a commodity column via cbind, then stack the
# two tagged frames on top of each other with rbind.
df <- rbind(cbind(df1, commodity = 'fig'),
            cbind(df2, commodity = 'apple'))
df

# When there is a whole list of data frames to stack (of any length),
# do.call passes every list element as an argument to rbind at once:
# exactly the same as rbind(dfs[[1]], dfs[[2]], ...).
dfs <- list(cbind(df1, commodity = 'fig'),
            cbind(df2, commodity = 'apple'))
df <- do.call('rbind', dfs)

#------------------------------------------------
# 2. Factors
#
# Things to learn:
#    summary
#    how factors look in a summary
#    creating factors, and specifying their levels
#    factors don't behave like strings
#    cut

summary(df)            # factor/character columns summarise as counts, numerics as quartiles

df$commodity           # was garbled as df\$commodity — `\$` is a syntax error in R

# A factor stores categorical data as integer codes plus a table of levels.
# The levels may include categories not present in the data (here 'black').
x <- factor(c('blue','red','green','blue','green'),
            levels=c('red','green','blue','black'))
x

which(x=='green')      # comparing a factor to a string works elementwise

x[1] <- 'black'        # assigning a value that IS a declared level works
x

x[1] <- 'orange'       # 'orange' is not a level: warns and inserts NA
x

c(x,'black')           # factors don't behave like strings when combined
                       # NOTE(review): result differs across R versions (c() on
                       # factors changed in R 4.1) — confirm on the target version

cut(df$price, breaks=c(0,.25,.5,.75,1))   # bin a numeric vector into a factor of intervals

#------------------------------------------------
# 3. Tabulation
#
# Things to learn:
#   grepl, for text-parsing
#   xtabs, the formula notation, subset
#   xtabs returns a matrix
#   sort

summary(logs)  # print out a summary of all the columns
logs[1:5,1:6]  # print out the first 5 rows, the first 6 columns

# grepl returns a vector of TRUEs and FALSEs, according to whether the elements of a character vector
# match a given regular expression. Regular expressions are a common feature of many languages,
# and they're widely used for text parsing.
logs$crawled <- grepl('(crawler)|(bot)|(slurp)', logs$browser)

logs[1:5, c('timestamp','request','url','status','searched','crawled')]

# Cross-tabulate counts of requests by searched/crawled status.
xtabs(~searched+crawled, data=logs)

# The first part of the url string is the owner of the web page.
# Whose web pages are most popular with searchers and crawlers?
logs$reqtype <- factor(ifelse(logs$searched,'search',ifelse(logs$crawled,'crawl','browse')))
# Strip an optional 'http:/' prefix plus the leading '/', and keep the first
# path component (characters up to the next '?', '/' or '-') as the owner.
logs$owner <- sub('(http:/)?/([^?/-]+).*','\\2',logs$url)
logs[1:5,c('timestamp','url','reqtype','owner')]

x <- xtabs(~owner+reqtype, data=logs, subset= owner %in% c('damon','lu','marcus'))
x

x['damon',]    # xtabs returns a matrix: rows indexable by name...

x[,'browse']   # ...and columns too

# Whose web pages are the most popular?
# Count how many requests are for each owner of a page.
# Sort this, highest to lowest, and find the 20 most popular owners.
x <- xtabs(~owner, data=logs)
sort(x, decreasing=TRUE)[1:20]

#-------------------------------------------------
# 4. Aggregation commands
#
# Things to learn:
#    split, process, reassemble, join.
#    xxtabs, from package djwutils
#    merge

library(djwutils)
xxtabs(size~owner, data=logs, FUN=mean)  # Produces error messages. Size is a factor, not a number.

# Convert a factor to a number via its string labels. as.numeric() alone would
# return the internal integer level codes, not the printed values.
logs$size <- as.numeric(as.character(logs$size))

# Split by owner, and in each chunk take the size column, and compute `median` on it
x <- xxtabs(size~owner, data=logs, FUN=median, subset=owner %in% c('damon','lu','marcus'))
x
as.data.frame(x)

# Combine the first 200 rows of logs, with the median file size for each row's owner
logs2 <- merge(logs[1:200, c(1:6, 12)], as.data.frame(x),
               by='owner')
logs2[1:10,]

# What are the first 10 log entries that are bigger than 10*median for that owner?
logs2[logs2$size/logs2$median > 10,][1:10,]
```