ggplot2 is part of the tidyverse collection of packages

#install.packages("tidyverse")
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats

what do we do when we plot data?

Class Chirality x y
Y L 0.881 0.216
Y R 0.435 0.586
Y R 0.463 0.337
Y R 0.526 0.498
Y L 0.821 0.021
N R 0.585 0.970
N R 0.795 0.735
N R 0.585 0.311
N L 0.156 0.889
N R 0.262 0.304

what do we do when we plot data?

what do we do when we plot data?

what do we do when we plot data?

what do we do when we plot data?

what do we do when we plot data?

ggplot

the grammar of graphics

  • data the data you want to plot
  • aesthetics how the data is mapped
  • geometries visualization of the data
  • stats representations of data that aid understanding
  • coordinates space on which data is plotted
  • facets how plots are subsetted
  • themes non-data aspects of plots

ggplot

topics

  • basic plotting
  • scale adjustments
  • position adjustment
  • zooming
  • facetting
  • labels
  • themes

ggplot

the basic syntax

ggplot(data = <DATA>, mapping = aes(<Mapping>)) +
        <GEOM_FUNCTION>()

ggplot constraints

example dataset (modified yeast gff file)

feature and chromosome are factors

levels(yeast_features$feature)
##  [1] "CDS"         "chromosome"  "exon"        "gene"        "mRNA"       
##  [6] "ncRNA_gene"  "pseudogene"  "rRNA"        "rRNA_gene"   "snoRNA"     
## [11] "snoRNA_gene" "snRNA"       "snRNA_gene"  "transcript"  "tRNA_gene"
levels(yeast_features$chromosome)
##  [1] "I"    "II"   "III"  "IV"   "IX"   "Mito" "V"    "VI"   "VII"  "VIII"
## [11] "X"    "XI"   "XII"  "XIII" "XIV"  "XV"   "XVI"

view structure using str()

str(yeast_features)
## Classes 'tbl_df', 'tbl' and 'data.frame':    7443 obs. of  6 variables:
##  $ chromosome: Factor w/ 17 levels "I","II","III",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ feature   : Factor w/ 15 levels "CDS","chromosome",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ start     : int  10091 11565 12046 13363 21566 22395 24000 31567 33448 35155 ...
##  $ stop      : int  10399 11951 12426 13743 21850 22685 27968 32940 34701 36303 ...
##  $ strand    : Factor w/ 3 levels "-",".","+": 3 1 3 1 3 1 1 3 3 3 ...
##  $ length    : int  308 386 380 380 284 290 3968 1373 1253 1148 ...

histogram of feature lengths

# Histogram of feature lengths. No `bins`/`binwidth` is given, so stat_bin()
# falls back to 30 bins and prints a message suggesting a better value.
ggplot(data = yeast_features, mapping = aes(x = length)) +
        geom_histogram()

plot as a distribution

ggplot(data = yeast_features, mapping = aes(x = length)) +
        geom_freqpoly()

color according to feature type

ggplot(data = yeast_features, mapping = aes(x = length)) +
        geom_freqpoly(mapping = aes(color = feature))

density

# Map y to the computed density instead of the raw count, so feature types
# with very different numbers of observations are drawn on a comparable scale.
# NOTE(review): `..density..` is the legacy "dot-dot" notation; ggplot2 >= 3.4.0
# deprecates it in favour of `after_stat(density)`.
ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..,color = feature)) +
        geom_freqpoly()

Try it yourself

# Toy dataset for the exercises: two categorical columns plus two columns of
# uniform random numbers (x and y differ on every run; no seed is set).
a <- tibble(
  Class     = rep(c("Y", "N"), each = 5),
  Chirality = c("L", "R", "R", "R", "L", "R", "R", "R", "L", "R"),
  x         = runif(10),
  y         = runif(10)
)

geom_point

ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length)) +
        geom_point(aes(color = feature))

overplotting: jitter

ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length)) +
        geom_point(aes(color = feature), position = "jitter")

overplotting: alpha

ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length)) +
        geom_jitter(aes(color = feature), alpha = 1/10)

geom_boxplot()

ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length)) +
        geom_boxplot()

mapping aesthetics within layers

# `color = feature` is mapped in ggplot(), so every layer inherits it;
# geom_boxplot() therefore draws one box per feature within each chromosome.
ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length, color = feature)) +
        geom_boxplot()

Combining layers

ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length)) +
        geom_boxplot() + 
        geom_point(position = "jitter", shape = ".", mapping= aes(color = feature))

Scales

ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length)) +
        geom_boxplot() + 
        geom_point(position = "jitter", shape = ".", mapping= aes(color = feature)) +
        scale_y_log10()

Ordering

# reorder() re-levels `chromosome` by the median `length` of its rows, so the
# boxes appear in order of increasing median feature length.
ggplot(data = yeast_features, mapping = aes(x = reorder(chromosome, length, FUN = median), y = length)) +
        geom_boxplot() + 
        geom_point(position = "jitter", shape = ".", mapping= aes(color = feature)) +
        scale_y_log10()

Flipping coordinates

ggplot(data = yeast_features, mapping = aes(x = reorder(chromosome, length, FUN = median), y = length)) +
        geom_boxplot() + 
        geom_point(position = "jitter", shape = ".", mapping= aes(color = feature)) +
        scale_y_log10() +
        coord_flip()

Barplots

one categorical variable (factor): chromosome

ggplot(data = yeast_features, mapping = aes(x = chromosome, fill = chromosome)) +
        geom_bar()

Barplots

two categorical variables (factors): chromosome & feature

ggplot(data = yeast_features, mapping = aes(x = chromosome, fill = feature)) +
        geom_bar()

Barplot variant: fill

# position = "fill" stacks the bars and rescales every stack to a height of 1,
# so the plot shows the proportion of each feature type per chromosome
# rather than absolute counts.
ggplot(data = yeast_features, mapping = aes(x = chromosome, fill = feature)) +
        geom_bar(position = "fill")

Barplot variant: dodge

ggplot(data = yeast_features, mapping = aes(x = chromosome, fill = feature)) +
        geom_bar(position = "dodge")

Statistics

# Summarise `length` per feature as the mean plus/minus a multiple of the
# standard deviation (mean_sdl() defaults to mult = 2), drawn with
# stat_summary()'s default pointrange geom.
# NOTE: mean_sdl() is a thin wrapper around Hmisc::smean.sdl(), so the Hmisc
# package must be installed; library(tidyverse) does not pull it in.
ggplot(data = yeast_features, mapping = aes(x = feature, y = length)) +
        stat_summary(fun.data = mean_sdl)

Facetting

ggplot(data = yeast_features, mapping = aes(x = length, fill = chromosome)) +
        geom_histogram() + 
        facet_wrap( ~ chromosome) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Facetting (grid)

# One density histogram per feature/strand combination.
# NOTE(review): the slide title says "grid", but facet_wrap() lays the
# combinations out as a wrapped sequence of panels (here in 2 rows); use
# facet_grid(feature ~ strand) if a true rows-by-columns grid was intended.
ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..)) +
        geom_histogram() + 
        facet_wrap(feature ~ strand, nrow = 2) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Scatter plots

# Start vs. stop coordinate of every feature on log-log axes; alpha = 1/3
# keeps overlapping points distinguishable.
# NOTE: `feature` is discrete, and ggplot2 warns that mapping a discrete
# variable to `size` is not advised; color and shape already encode it.
ggplot(data = yeast_features, mapping = aes(x = start, y = stop)) +
        geom_point(aes(size = feature, color = feature, shape = feature), alpha = 1/3) +
        scale_x_log10() + 
        scale_y_log10()

Statistics: adding trend line

# geom_smooth() inherits only x and y from ggplot() (size/color/shape are
# mapped inside geom_point()), so one trend line is fitted to all features.
# No `method` is given, so ggplot2 chooses one itself (it reports 'gam' here).
ggplot(data = yeast_features, mapping = aes(x = start, y = stop)) +
        geom_point(aes(size = feature, color = feature, shape = feature), alpha = 1/3) +
        scale_x_log10() + 
        scale_y_log10() + 
        geom_smooth()
## `geom_smooth()` using method = 'gam'

Statistics: controlling trend line

ggplot(data = yeast_features, mapping = aes(x = start, y = stop)) +
        geom_point(aes(size = feature, color = feature, shape = feature), alpha = 1/3) +
        scale_x_log10() + 
        scale_y_log10() + 
        geom_smooth(method = "lm")

Colors (setting color)

# Setting vs. mapping: `fill = "red"` is given outside aes(), so it is a fixed
# value for this layer and takes precedence over the `fill = chromosome`
# mapping inherited from ggplot(); every bar is drawn in red.
ggplot(data = yeast_features, mapping = aes(x = length, fill = chromosome)) +
        geom_histogram(fill = "red") + 
        scale_x_log10() +
        facet_wrap( ~ chromosome) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Zooming: coord_cartesian()

# coord_cartesian() zooms the view without removing any data, so geom_smooth()
# is still fitted to all points, not only those inside the visible window.
ggplot(data = yeast_features, mapping = aes(x = start, y = stop)) +
        geom_point(aes(size = feature, color = feature, shape = feature), alpha = 1/3) +
        scale_x_log10() + 
        scale_y_log10() + 
        geom_smooth() +
        coord_cartesian(xlim = c(10,5000), ylim = c(10,5000))
## `geom_smooth()` using method = 'gam'

Labels

ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..)) +
        geom_freqpoly(mapping = aes(color = feature)) +
        scale_x_log10() +
        labs(
                title = "Distribution of feature sizes",
                x = "length (base pairs)",
                y = "probability density"
        ) + 
        theme(legend.position = "bottom")

Themes : changes the overall look

# Density of feature lengths per feature type, with labels and a complete theme.
# theme_light() is a complete theme that resets every theme element, so it
# must be added BEFORE theme(); in the opposite order it silently discards
# the legend.position = "bottom" setting.
ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..)) +
        geom_freqpoly(mapping = aes(color = feature)) +
        scale_x_log10() +
        labs(
                title = "Distribution of feature sizes",
                x = "length (base pairs)",
                y = "probability density"
        ) +
        theme_light() +
        theme(legend.position = "bottom")

Themes

more themes available in add-on package ggthemes

#install.packages("ggthemes")
library(ggthemes)

Additional themes : theme_tufte()

ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..)) +
        geom_freqpoly(mapping = aes(color = feature)) +
        scale_x_log10() +
        theme_tufte()

Additional themes : theme_excel()

Don’t do this!

ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..)) +
        geom_freqpoly(mapping = aes(color = feature)) +
        scale_x_log10() +
        theme_excel()

Generic plot function

ggplot(data = <DATA>, mapping = aes(<Mapping>)) +
        <GEOM_FUNCTION>() + 
        <STAT_FUNCTION>() +
        <FACET_FUNCTION>() + 
        <SCALE_FUNCTION>() +
        <THEME_FUNCTION>()
        

Concise code

# Same plot with positional arguments instead of `data =` / `mapping =`.
# The complete theme (theme_light()) comes first and the theme() tweak second;
# added the other way round, theme_light() would reset the legend position.
ggplot(yeast_features, aes(length, ..density..)) +
        geom_freqpoly(aes(color = feature)) +
        scale_x_log10() +
        theme_light() +
        theme(legend.position = "bottom")
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing non-finite values (stat_bin).

saving plots as variables

# Store the plot object in a variable so further layers can be added later.
# theme_light() (a complete theme) is applied before the theme() tweak;
# otherwise it would overwrite legend.position = "bottom".
my_plot <- ggplot(yeast_features, aes(length, ..density..)) +
        geom_freqpoly(aes(color = feature)) +
        scale_x_log10() +
        labs(
                title = "Distribution of feature sizes",
                x = "length (base pairs)",
                y = "probability density"
        ) +
        theme_light() +
        theme(legend.position = "bottom")

adding to plot variables

my_plot + geom_hline(yintercept = 2, color = "red")
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing non-finite values (stat_bin).

Exercise 1

Add: 1. new axis labels, 2. title, 3. trend line, and 4. change theme to theme_tufte()

ggplot(data = a, aes(x = x, y = y, color = Class))+
        geom_point(size = 3)

Read in and modify gff

# Read the yeast GFF3 annotation as a tab-separated table without a header row
# (the columns get the default names X1..X9). The first 24 lines are skipped
# and any remaining text starting with "#" is treated as a comment.
# NOTE(review): skip = 24 hard-codes the header length of this particular
# file release; confirm it if the input file changes.
gff <- read_delim("Saccharomyces_cerevisiae.R64-1-1.34.gff3", 
    "\t", escape_double = FALSE, col_names = FALSE, 
    comment = "#", trim_ws = TRUE, skip = 24)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character(),
##   X4 = col_integer(),
##   X5 = col_integer(),
##   X6 = col_character(),
##   X7 = col_character(),
##   X8 = col_character(),
##   X9 = col_character()
## )
# Give the nine GFF3 columns readable names. Columns 6 and 8 keep the
# placeholder names "unknown1"/"unknown2" (the GFF3 specification calls them
# score and phase).
colnames(gff) <- c(
  "chromosome", "source", "feature",
  "start", "stop", "unknown1",
  "strand", "unknown2", "info"
)
# Correct data types: store the categorical columns as factors.
gff[c("feature", "chromosome", "strand")] <-
  lapply(gff[c("feature", "chromosome", "strand")], as.factor)

# Build the plotting table: keep the columns used in the plots, add the
# feature length, and restrict the rows to the feature types of interest.
# GFF3 coordinates are 1-based and inclusive, so a feature covers
# (stop - start + 1) bases. Without the +1, single-base features get length 0
# and are dropped as non-finite values on log-scaled axes.
yeast_features <- gff %>%
        select(chromosome, feature, start, stop, strand) %>%
        mutate(length = abs(start - stop) + 1L) %>%
        filter(feature %in% c("CDS", "rRNA", "snoRNA", "snRNA", "tRNA_gene"))

Exercise 2

Plot as barplots with error bars

# Starting point for the exercise: mean plus/minus a multiple of the standard
# deviation of `length` per feature.
# NOTE: mean_sdl() wraps Hmisc::smean.sdl(), so the Hmisc package must be
# installed for this to run.
ggplot(data = yeast_features, mapping = aes(x = feature, y = length)) +
        stat_summary(fun.data = mean_sdl)

Exercise 3

Change bin size for histogram

ggplot(data = yeast_features, mapping = aes(x = length)) +
        geom_histogram()