Advanced_Data_Analysis_with_Elem_POV_Shalizi/time-series.R

# Examples for the lecture on time series

# Real-world data set
library(datasets)
data(lynx)

# Synthetic data set
  # Logistic map corrupted with observational noise
    # Exercise fo the student: why is this "logistic"?
logistic.map <- function(x,r=4) { r*x*(1-x) }
logistic.iteration <- function(n,x.init,r=4){
  x <- vector(length=n)
  x[1] <- x.init
  for (i in 1:(n-1)) {
    x[i+1] <- logistic.map(x[i],r=r)
  }
  return(x)
}
x <- logistic.iteration(1000,x.init=runif(1))
y <- x+rnorm(1000,mean=0,sd=0.05)

# plot all of the lynx series
plot(lynx)
# Plot the first part of the synthetic time series
plot(y[1:100],xlab="t",ylab=expression(y[t]),type="l")

# autocorrelation functions
acf(lynx)
acf(y)

# Convert a time series into a data frame of lagged values
# Input: a time series, maximum lag to use, whether older values go on the right
  # or the left
# Output: a data frame with (order+1) columns, named lag0, lag1, ... , and
  # length(ts)-order rows
design.matrix.from.ts <- function(ts,order,right.older=TRUE) {
  n <- length(ts)
  x <- ts[(order+1):n]
  for (lag in 1:order) {
    if (right.older) {
      x <- cbind(x,ts[(order+1-lag):(n-lag)])
    } else {
      x <- cbind(ts[(order+1-lag):(n-lag)],x)
    }
  }
  lag.names <- c("lag0",paste("lag",1:order,sep=""))
  if (right.older) {
    colnames(x) <- lag.names
  } else {
    colnames(x) <- rev(lag.names)
  }
  return(as.data.frame(x))
}

# Plot successive values of lynx against each other
plot(lag0 ~ lag1,data=design.matrix.from.ts(lynx,1),xlab=expression(lynx[t]),
  ylab=expression(lynx[t+1]),pch=16)
# Plot successive values of y against each other
plot(lag0 ~ lag1,data=design.matrix.from.ts(y,1),xlab=expression(y[t]),
  ylab=expression(y[t+1]),pch=16)

  # Exercise for student: write one function to do either of those plots


# Fit an additive autoregressive model
  # additive model fitting is outsourced to mgcv::gam, with splines
# Inputs: time series (x), order of autoregression (order)
# Output: fitted GAM object
aar <- function(ts,order) {
  stopifnot(require(mgcv))
  # Automatically generate a suitable data frame from the time series
  # and a formula to go along with it
  fit <- gam(as.formula(auto.formula(order)),
    data=design.matrix.from.ts(ts,order))
  return(fit)
}

# Generate formula for an autoregressive GAM of a specified order
# Input: order (integer)
# Output: a formula which looks like
  # "lag0 ~ s(lag1) + s(lag2) + ... + s(lagorder)"
auto.formula <- function(order) {
  inputs <- paste("s(lag",1:order,")",sep="",collapse="+")
    form <- paste("lag0 ~ ",inputs)
    return(form)
}


# Plot successive values of y against each other
plot(lag0 ~ lag1,data=design.matrix.from.ts(y,1),xlab=expression(y[t]),
  ylab=expression(y[t+1]),pch=16)
# Add the linear regression (which would be the AR(1) model)
abline(lm(lag0~lag1,data=design.matrix.from.ts(y,1)),col="red")
# Fit an AR(8) and add its fitted values
yar8 <- arma(y,order=c(8,0))
points(y,fitted(yar8),col="red")
# Fit a first-order nonparametric autoregression, add fitted values
yaar1 <- aar(y,order=1)
points(y[-length(y)],fitted(yaar1),col="blue")


# simple block bootstrap
# inputs: time series (ts), block length, length of output
# presumes: ts is a univariate time series
# output: one resampled time series
rblockboot <- function(ts,block.length,len.out=length(ts)) {
  # chop up ts into blocks
  the.blocks <- as.matrix(design.matrix.from.ts(ts,block.length-1,
    right.older=FALSE))
    # look carefully at design.matrix.from.ts to see why we need the -1
  # How many blocks is that?
  blocks.in.ts <- nrow(the.blocks)
  # Sanity-check
  stopifnot(blocks.in.ts == length(ts) - block.length+1)
  # How many blocks will we need (round up)?
  blocks.needed <- ceiling(len.out/block.length)
  # Sample blocks with replacement
  picked.blocks <- sample(1:blocks.in.ts,size=blocks.needed,replace=TRUE)
  # put the blocks in the randomly-selected order
  x <- the.blocks[picked.blocks,]
  # convert from a matrix to a vector and return
    # need transpose because R goes from vectors to matrices and back column by
    # column, not row by row
  x.vec <- as.vector(t(x))
    # Discard uneeded extra observations at the end silently
  return(x[1:len.out])
}

###############################################################################
# Exercises for the reader:
# 1. Modify this to do the circular block bootstrap (hint: extend ts)
# 2. Modify this to do block bootstrapping of multiple time series, using the
# # SAME blocks for all series (to preserve dependencies across series)
##############################################################################


# GDP per capita time series
  # Taken from the St. Louis Federal Reserve Bank's FRED data service
  # GDP in billions of constant (2005) dollars
  # http://research.stlouisfed.org/fred2/series/GDPCA?cid=106
  # over population in thousands
  # http://research.stlouisfed.org/fred2/series/POP?cid=104
  # so the defualt units are millions of dollars per capita, which is needlessly
  # hard to interpret
gdppc <- read.csv("gdp-pc.csv")
gdppc$y <- gdppc$y*1e6
plot(gdppc,log="y",type="l")