Loading Data and First Summaries

library("gapminder")
## Structure: dimensions plus each column's type and first few values
str(gapminder)
## 'data.frame':    1704 obs. of  6 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : num  1952 1957 1962 1967 1972 ...
##  $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
##  $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
##  $ gdpPercap: num  779 821 853 836 740 ...
## Top of data: head() shows the first six rows by default
head(gapminder)
##       country continent year lifeExp      pop gdpPercap
## 1 Afghanistan      Asia 1952  28.801  8425333   779.445
## 2 Afghanistan      Asia 1957  30.332  9240934   820.853
## 3 Afghanistan      Asia 1962  31.997 10267083   853.101
## 4 Afghanistan      Asia 1967  34.020 11537966   836.197
## 5 Afghanistan      Asia 1972  36.088 13079460   739.981
## 6 Afghanistan      Asia 1977  38.438 14880372   786.113
## Summary: per-column overview (level counts for factors, quartiles and
## mean for numeric columns)
summary(gapminder)
##         country        continent        year         lifeExp    
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.6  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.2  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.7  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.5  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.8  
##  Australia  :  12                  Max.   :2007   Max.   :82.6  
##  (Other)    :1632                                               
##       pop             gdpPercap     
##  Min.   :6.00e+04   Min.   :   241  
##  1st Qu.:2.79e+06   1st Qu.:  1202  
##  Median :7.02e+06   Median :  3532  
##  Mean   :2.96e+07   Mean   :  7215  
##  3rd Qu.:1.96e+07   3rd Qu.:  9326  
##  Max.   :1.32e+09   Max.   :113523  
## 
## Tabulation of one variable: number of rows per continent
table(gapminder$continent)
## 
##   Africa Americas     Asia   Europe  Oceania 
##      624      300      396      360       24
## Example aggregate() use: median life expectancy within each continent
aggregate(lifeExp ~ continent, data = gapminder, FUN = median)
##   continent lifeExp
## 1    Africa 47.7920
## 2  Americas 67.0480
## 3      Asia 61.7915
## 4    Europe 72.2410
## 5   Oceania 73.6650

Single Country through Time, Standard Plot

## Life expectancy over the years for one country; type = "b" draws both
## points and connecting lines.
plot(
  lifeExp ~ year,
  data = gapminder,
  subset = country == "Cambodia",
  type = "b"
)

Scatter plot: life expectancy as a function of GDP per capita (2007, log-scaled x-axis)

## Cross-section for 2007; log = "x" puts GDP per capita on a log axis.
plot(
  lifeExp ~ gdpPercap,
  data = gapminder,
  subset = year == 2007,
  log = "x"
)

Summaries via aggregate as a one-liner

aggregate(lifeExp ~ continent, gapminder, FUN = median, subset = year >= 2000)
##   continent lifeExp
## 1    Africa 52.4060
## 2  Americas 72.8275
## 3      Asia 71.6570
## 4    Europe 78.1770
## 5   Oceania 80.2870

The plyr Package and its ddply Workhorse

suppressMessages(library("plyr"))
## Assign: keep the per-continent maximum life expectancy, then print it
maxLeByCont <- ddply(
  .data = gapminder,
  .variables = ~ continent,
  .fun = summarize,
  maxLe = max(lifeExp)
)
maxLeByCont
##   continent  maxLe
## 1    Africa 76.442
## 2  Americas 80.653
## 3      Asia 82.603
## 4    Europe 81.757
## 5   Oceania 81.235
## Different summarize use: count the distinct countries in each continent
ddply(
  gapminder, ~ continent, summarize,
  nUniqCountries = length(unique(country))
)
##   continent nUniqCountries
## 1    Africa             52
## 2  Americas             25
## 3      Asia             33
## 4    Europe             30
## 5   Oceania              2
## Or via function(): the function receives each continent's rows as a data
## frame, so the column must be pulled out explicitly (piece$country)
ddply(
  gapminder, ~ continent,
  function(piece) c(nUniqCountries = length(unique(piece$country)))
)
##   continent nUniqCountries
## 1    Africa             52
## 2  Americas             25
## 3      Asia             33
## 4    Europe             30
## 5   Oceania              2
## Multiple results: several summaries computed in one summarize call
ddply(
  gapminder, ~ continent, summarize,
  min_le = min(lifeExp),
  max_le = max(lifeExp),
  med_gdppc = median(gdpPercap)
)
##   continent min_le max_le med_gdppc
## 1    Africa 23.599 76.442   1192.14
## 2  Americas 37.579 80.653   5465.51
## 3      Asia 28.801 82.603   2646.79
## 4    Europe 43.585 81.757  12081.75
## 5   Oceania 69.120 81.235  17983.30

Fitter function: linear regression of life expectancy on year, returning the intercept and slope

## Fit a straight line to life expectancy over time for one block of rows.
##
## dat:    data frame providing `lifeExp` and `year` columns.
## offset: value subtracted from `year` before fitting, so the intercept is
##         the fitted life expectancy at that year (default 1952).
##
## Returns a named numeric vector c(intercept = ..., slope = ...).
myLinFit <- function(dat, offset = 1952) {
  # `offset` inside I() is resolved from this function's arguments.
  trend <- lm(lifeExp ~ I(year - offset), data = dat)
  estimates <- coef(trend)
  names(estimates) <- c("intercept", "slope")
  estimates
}

## Try the fitter on a single country first
myLinFit(dat = subset(gapminder, country == "Canada"))
## intercept     slope 
## 68.883846  0.218869

Using this over all countries:

## One row of fitted coefficients per country
allcoefs <- ddply(.data = gapminder, .variables = ~ country, .fun = myLinFit)
str(allcoefs)
## 'data.frame':    142 obs. of  3 variables:
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ intercept: num  29.9 59.2 43.4 32.1 62.7 ...
##  $ slope    : num  0.275 0.335 0.569 0.209 0.232 ...
## First few per-country estimates
head(allcoefs)
##       country intercept    slope
## 1 Afghanistan   29.9073 0.275329
## 2     Albania   59.2291 0.334683
## 3     Algeria   43.3750 0.569280
## 4      Angola   32.1267 0.209340
## 5   Argentina   62.6884 0.231708
## 6   Australia   68.4005 0.227724

Plot all coefficient pairs (slope against intercept, one point per country):

## Slope against intercept, one point per country. The formula interface
## matches the earlier plots and labels the axes with the plain column names
## instead of "allcoefs$intercept" / "allcoefs$slope".
plot(slope ~ intercept, data = allcoefs)