Manual

Introduction to R

Installation

Install R and RStudio.

RStudio basics

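RStudio has four panels: the source editor, the console, the environment/history panel, and the files/plots/packages/help panel.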

To get help about a function, type the function name with a question mark in front:

# Opens the help page for data.frame; equivalent to help("data.frame")
?data.frame

R packages

R packages are reusable libraries of code. To install and load packages from the console, do:

# Download and install the package from CRAN (only needed once).
install.packages("ggplot2")
# Attach the package for this session. library() stops with an error if the
# package is missing, whereas require() would only warn and return FALSE.
library(ggplot2)

This only works for packages which were published on CRAN. Nowadays packages are often published on GitHub. To install these packages, we can use the install_github function in the devtools package. Here we use the double colon syntax (package::function) to call install_github directly from the devtools package without first loading the package with library().

# Install devtools itself from CRAN.
install.packages("devtools")
# pkg::fun calls fun straight from the pkg namespace, so devtools does not
# have to be attached with library() first.
devtools::install_github("ropensci/rgbif")

Data types

Vectors

Vectors are the most basic data structure in R. They are ordered collections of values that all share the same class, such as numeric, character, or logical. Single values are vectors of length 1:

> a <- 1
> a
[1] 1
> class(a)
[1] "numeric"
> length(a)
[1] 1
> b <- "banana"
> b
[1] "banana"
> class(b)
[1] "character"
> d <- FALSE
> d
[1] FALSE
> class(d)
[1] "logical"
> a <- c(1, 2)
> a
[1] 1 2
> b <- seq(1, 10)
> b
[1]  1  2  3  4  5  6  7  8  9 10
> length(b)
[1] 10

Matrices

Matrices are two-dimensional data structures. Again, all elements are of the same class.

> matrix(1:6, nrow=3, ncol=2)
     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6

Data frames

In data frames, the columns can be of different classes.

> d <- data.frame(a=c(1, 2, 3), b=c("x", "y", "z"))
> d
  a b
1 1 x
2 2 y
3 3 z
> d$a
[1] 1 2 3
> d[1]
  a
1 1
2 2
3 3
> d[1,]
  a b
1 1 x
> d[,1]
[1] 1 2 3
> d[["a"]]
[1] 1 2 3
> d[,"a"]
[1] 1 2 3

The dplyr package has a data frame wrapper, which produces prettier output when printing:

# dplyr must be attached before its data frame wrapper can be used.
library(dplyr)

data(iris)
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0.
as_tibble(iris)

Lists

A list is a collection of objects.

> a <- data.frame(a=c(1, 2, 3), b=c("x", "y", "z"))
> l <- list(a=a, b=1)
> l
$a
  a b
1 1 x
2 2 y
3 3 z

$b
[1] 1
> l$a
  a b
1 1 x
2 2 y
3 3 z
> l[[1]]
  a b
1 1 x
2 2 y
3 3 z
> l[["a"]]
  a b
1 1 x
2 2 y
3 3 z

Factors

Reading data

Reading delimited text files

# Tab-delimited text file with a header row; keep strings as character.
data <- read.table(
  "data.txt",
  header = TRUE,
  sep = "\t",
  dec = ".",
  stringsAsFactors = FALSE
)
# Comma-separated file, using the read.csv() defaults.
data <- read.csv("data.csv")

Reading Excel files

Excel files can be read using the xlsx package.

# library() fails loudly if xlsx is not installed; require() would only
# warn and let the read.xlsx() calls below fail with a less clear error.
library(xlsx)

# Select a worksheet either by position ...
data <- read.xlsx("data.xlsx", sheetIndex = 1)
# ... or by name.
data <- read.xlsx("data.xlsx", sheetName = "somesheet")

Reading shapefiles

Shapefiles can be read using the rgdal package. The example below also transforms the data so it can easily be visualized using ggplot2:

# NOTE(review): rgdal and maptools were retired from CRAN in 2023; on a
# current R installation the sf package is the maintained alternative.
library(maptools)
library(rgdal)
library(ggplot2)

# mode = "wb" forces a binary transfer. Without it the ZIP archive is
# corrupted on Windows, because the URL does not end in ".zip" and R
# therefore does not switch to binary mode on its own.
download.file(
  "http://iobis.org/geoserver/OBIS/ows?service=WFS&version=1.0.0&request=GetFeature&typeName=OBIS:summaries&outputFormat=SHAPE-ZIP",
  destfile = "summaries.zip",
  mode = "wb"
)
unzip("summaries.zip")

shape <- readOGR("summaries.shp", layer = "summaries")
# Give every feature an id (its row name) that fortify() can split on.
shape@data$id <- rownames(shape@data)
# Flatten the polygons into a plain data frame of coordinates for ggplot2.
df <- fortify(shape, region = "id")
# Re-attach the attribute table to the coordinates via the shared id.
data <- merge(df, shape@data, by = "id")

Reading from ZIP files

This example shows how to download a ZIP file and read one of the files it contains:

# Download the archive to a temporary file.
temp <- tempfile()
# mode = "wb" forces a binary transfer; without it the archive is corrupted
# on Windows because the URL does not end in ".zip".
download.file(
  "http://ipt.vliz.be/eurobis/archive.do?r=nsbs&v=1.1",
  temp,
  mode = "wb"
)
# unz() opens a connection to a single file inside the ZIP archive.
data <- read.table(
  unz(temp, "occurrence.txt"),
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Remove the temporary download once the data has been read.
unlink(temp)

Inspecting data

# library() stops with an error when a package is missing; require() would
# only warn and return FALSE.
library(robis)
library(dplyr)

data <- occurrence("Abra")

# for this example, convert back from a tibble (dplyr) to a standard data frame
data <- as.data.frame(data)

head(data)           # first six rows
head(data, n = 100)  # first 100 rows
dim(data)            # number of rows and columns
nrow(data)
ncol(data)
names(data)          # column names
str(data)            # compact overview of the structure
summary(data)        # per-column summary statistics
View(data)           # open the data in a spreadsheet-style viewer

# now convert to a tibble (dplyr); as_tibble() replaces the deprecated tbl_df()
data <- as_tibble(data)

data
head(data)
print(data, n = 100)

Manipulating data

Filtering

# library() stops with an error when a package is missing; require() would
# only warn and return FALSE.
library(robis)
library(dplyr)

data <- occurrence("Abra")
# Keep only Abra alba records collected after 2005.
data %>%
  filter(scientificName == "Abra alba" & yearcollected > 2005)

Reordering

# Sort by dataset name, then by event date in descending order.
data %>%
  arrange(datasetName, desc(eventDate))

Selecting and renaming columns

# Keep four columns; the coordinate columns are renamed to lon and lat.
data %>%
  select(
    scientificName,
    eventDate,
    lon = decimalLongitude,
    lat = decimalLatitude
  )

select() can be used with distinct() to find unique combinations of values:

# Unique combinations of species name and locality.
data %>%
  select(scientificName, locality) %>%
  distinct()

Adding columns

# Add a zone column that bins the minimum depth using the breaks 0, 10 and
# 100; depths outside the breaks give NA and are dropped.
data %>%
  mutate(zone = .bincode(minimumDepthInMeters, breaks = c(0, 10, 100))) %>%
  select(minimumDepthInMeters, zone) %>%
  filter(!is.na(zone)) %>%
  print(n = 100)

Aggregation

# Mean and standard deviation of latitude over all records.
data %>%
  summarise(
    lat_mean = mean(decimalLatitude),
    lat_sd = sd(decimalLatitude)
  )
# Number of records and number of distinct datasets per species.
data %>%
  group_by(scientificName) %>%
  summarise(
    records = n(),
    datasets = n_distinct(datasetName)
  )

Restructuring

This example converts a dataset from OBIS to a matrix format, which is more suitable for community analysis:

# library() stops with an error when a package is missing; require() would
# only warn and return FALSE.
library(robis)
library(reshape2)

data <- occurrence(resourceid = 586)
# One row per locality, one column per species, filled with individualCount.
wdata <- dcast(
  data,
  locality ~ scientificName,
  value.var = "individualCount"
)

And the other way around, from wide format to long format:

# Stack the species columns back into name/value pairs (long format).
ldata <- melt(
  wdata,
  variable.name = "scientificName",
  value.name = "individualCount"
)

Plotting

In this example, data for one species is extracted from an OBIS dataset. Density and depth are visualized using the ggplot2 package:

# library() stops with an error when a package is missing; require() would
# only warn and return FALSE.
library(robis)
library(dplyr)
library(reshape2)
library(ggplot2)

data <- occurrence(resourceid = 586)

# Per locality: mean count, mean position and mean depth of one species.
afil <- data %>%
  filter(scientificName == "Amphiura filiformis") %>%
  group_by(locality) %>%
  summarise(
    n = mean(individualCount),
    lon = mean(decimalLongitude),
    lat = mean(decimalLatitude),
    depth = mean(minimumDepthInMeters)
  )

# Point size encodes the mean count, colour encodes the mean depth.
ggplot() +
  geom_point(data = afil, aes(lon, lat, size = n, colour = depth)) +
  scale_colour_distiller(palette = "Spectral") +
  theme(panel.background = element_blank()) +
  coord_fixed(ratio = 1) +
  scale_size(range = c(2, 12))

Mapping

The leaflet package can be used to create interactive web-based maps. The example below shows the results of an outlier analysis of Verruca stroemia occurrences:


# occurrence() comes from robis, so load it here as the other examples do.
# library() stops with an error when a package is missing; require() would
# only warn and return FALSE.
library(robis)
library(leaflet)

data <- occurrence("Verruca stroemia")

# Count how many of the requested flags are set in each element of `qc`.
#
# qc:     vector of integer bit fields, one per record.
# number: 1-based positions of the flags (bits) to test.
# Returns an integer vector with one count per element of `qc`.
qcflag <- function(qc, number) {
  # Flag number k corresponds to the bit with value 2^(k - 1).
  mask <- 2^(number - 1)
  # vapply() always returns an integer vector, also for empty input, where
  # sapply() would return an empty list instead.
  vapply(qc, function(x) sum(bitwAnd(x, mask) > 0), integer(1))
}

# Number of the two QC flags (24 and 28) that are set for each record.
data$qcnum <- qcflag(data$qc, c(24, 28))

# Map 0, 1 or 2 set flags to red, orange or green (indices 1 to 3).
colors <- c("red", "orange", "green")[data$qcnum + 1]

# Build the map: base tiles first, then one circle marker per record.
m <- leaflet() %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    data = data.frame(lat = data$decimalLatitude, lng = data$decimalLongitude),
    radius = 3,
    weight = 0,
    fillColor = colors,
    fillOpacity = 0.5
  )
m