## Load the gapminder package; the `gapminder` data frame inspected below is
## presumably provided by it (no other definition is visible in this script).
## NOTE: `##` lines that directly follow a call are that call's printed output
## as recorded when the script was run; they are kept for reference only.
library("gapminder")
## Structure: column names, column types and a preview of the values
str(gapminder)
## 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : num 1952 1957 1962 1967 1972 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : num 8425333 9240934 10267083 11537966 13079460 ...
## $ gdpPercap: num 779 821 853 836 740 ...
## Top of data: the first six rows
head(gapminder)
## country continent year lifeExp pop gdpPercap
## 1 Afghanistan Asia 1952 28.801 8425333 779.445
## 2 Afghanistan Asia 1957 30.332 9240934 820.853
## 3 Afghanistan Asia 1962 31.997 10267083 853.101
## 4 Afghanistan Asia 1967 34.020 11537966 836.197
## 5 Afghanistan Asia 1972 36.088 13079460 739.981
## 6 Afghanistan Asia 1977 38.438 14880372 786.113
## Summary: per-column summaries (level counts for factors, quantiles and mean
## for numeric columns)
summary(gapminder)
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.6
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.2
## Algeria : 12 Asia :396 Median :1980 Median :60.7
## Angola : 12 Europe :360 Mean :1980 Mean :59.5
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.8
## Australia : 12 Max. :2007 Max. :82.6
## (Other) :1632
## pop gdpPercap
## Min. :6.00e+04 Min. : 241
## 1st Qu.:2.79e+06 1st Qu.: 1202
## Median :7.02e+06 Median : 3532
## Mean :2.96e+07 Mean : 7215
## 3rd Qu.:1.96e+07 3rd Qu.: 9326
## Max. :1.32e+09 Max. :113523
##
## Tabulation of one variable: number of rows per continent
table(gapminder$continent)
##
## Africa Americas Asia Europe Oceania
## 624 300 396 360 24
## Median life expectancy per continent, using aggregate()'s formula interface
aggregate(lifeExp ~ continent, data = gapminder, FUN = median)
## continent lifeExp
## 1 Africa 47.7920
## 2 Americas 67.0480
## 3 Asia 61.7915
## 4 Europe 72.2410
## 5 Oceania 73.6650
## Life expectancy over time for Cambodia only (type "b": points and lines)
plot(lifeExp ~ year, data = gapminder, subset = country == "Cambodia",
     type = "b")
## Life expectancy against GDP per capita for 2007, with a log-scaled x axis
plot(lifeExp ~ gdpPercap, data = gapminder, subset = year == 2007,
     log = "x")
## Same per-continent median, restricted to rows from the year 2000 onwards
aggregate(lifeExp ~ continent, data = subset(gapminder, year >= 2000),
          FUN = median)
## continent lifeExp
## 1 Africa 52.4060
## 2 Americas 72.8275
## 3 Asia 71.6570
## 4 Europe 78.1770
## 5 Oceania 80.2870
## Attach plyr (used for ddply() below) without its start-up messages
suppressMessages(library("plyr"))
## Assign: keep the per-continent maximum life expectancy, then show it
maxLeByCont <- ddply(gapminder, .variables = ~ continent, .fun = summarize,
                     maxLe = max(lifeExp))
maxLeByCont
## continent maxLe
## 1 Africa 76.442
## 2 Americas 80.653
## 3 Asia 82.603
## 4 Europe 81.757
## 5 Oceania 81.235
## Different summarize use: number of distinct countries per continent
ddply(gapminder, .variables = ~ continent, .fun = summarize,
      nUniqCountries = length(unique(country)))
## continent nUniqCountries
## 1 Africa 52
## 2 Americas 25
## 3 Asia 33
## 4 Europe 30
## 5 Oceania 2
## Or via function(): the function receives each continent's rows as a data
## frame, so the column has to be extracted from it explicitly
ddply(gapminder, .variables = ~ continent, .fun = function(piece) {
  c(nUniqCountries = length(unique(piece$country)))
})
## continent nUniqCountries
## 1 Africa 52
## 2 Americas 25
## 3 Asia 33
## 4 Europe 30
## 5 Oceania 2
## Multiple results: several summary columns computed in one call
ddply(gapminder, .variables = ~ continent, .fun = summarize,
      min_le = min(lifeExp),
      max_le = max(lifeExp),
      med_gdppc = median(gdpPercap))
## continent min_le max_le med_gdppc
## 1 Africa 23.599 76.442 1192.14
## 2 Americas 37.579 80.653 5465.51
## 3 Asia 28.801 82.603 2646.79
## 4 Europe 43.585 81.757 12081.75
## 5 Oceania 69.120 81.235 17983.30
## Fit a straight line of life expectancy against (year - offset).
##
## Arguments:
##   dat    - data frame with columns `lifeExp` and `year`.
##   offset - year subtracted from `year` before fitting (default 1952), so
##            the intercept is the fitted life expectancy at that year.
##
## Returns the two fitted coefficients as a numeric vector named
## "intercept" and "slope".
myLinFit <- function(dat, offset = 1952) {
  trend_model <- lm(lifeExp ~ I(year - offset), data = dat)
  estimates <- coef(trend_model)
  ## Replace lm()'s default coefficient labels with short, stable names
  names(estimates) <- c("intercept", "slope")
  estimates
}
## Try the fit on a single country (Canada) before applying it everywhere
myLinFit(dat = subset(gapminder, country == "Canada"))
## intercept slope
## 68.883846 0.218869
## Using this over all countries:
## ddply() calls myLinFit() once per country and collects the named
## coefficient vectors into one data frame (one row per country).
allcoefs <- ddply(gapminder, ~ country, myLinFit)
## Check the shape and column types of the combined result
str(allcoefs)
## 'data.frame': 142 obs. of 3 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ intercept: num 29.9 59.2 43.4 32.1 62.7 ...
## $ slope : num 0.275 0.335 0.569 0.209 0.232 ...
## First six rows of the per-country coefficients
head(allcoefs)
## country intercept slope
## 1 Afghanistan 29.9073 0.275329
## 2 Albania 59.2291 0.334683
## 3 Algeria 43.3750 0.569280
## 4 Angola 32.1267 0.209340
## 5 Argentina 62.6884 0.231708
## 6 Australia 68.4005 0.227724
## Plot all coef pairs:
## one point per row of allcoefs, intercept on the x axis and slope on the y axis
plot(allcoefs$intercept, allcoefs$slope)