ggplot2 is part of the tidyverse collection of packages; load it with library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats

what do we do when we plot data?

Class Chirality x y
Y L 0.881 0.216
Y R 0.435 0.586
Y R 0.463 0.337
Y R 0.526 0.498
Y L 0.821 0.021
N R 0.585 0.970
N R 0.795 0.735
N R 0.585 0.311
N L 0.156 0.889
N R 0.262 0.304

the grammar of graphics

  • data the data you want to plot
  • aesthetics how the data is mapped
  • geometries visualization of the data
  • stats representations of data that aid understanding
  • coordinates space on which data is plotted
  • facets how plots are subsetted
  • themes non-data aspects of plots



  • basic plotting
  • scale adjustments
  • position adjustment
  • zooming
  • facetting
  • labels
  • themes


the basic syntax

ggplot(data = <DATA>, mapping = aes(<Mapping>)) +
        <GEOM_FUNCTION>()

ggplot constraints

example dataset (modified yeast gff file)

feature and chromosome are factors

##  [1] "CDS"         "chromosome"  "exon"        "gene"        "mRNA"       
##  [6] "ncRNA_gene"  "pseudogene"  "rRNA"        "rRNA_gene"   "snoRNA"     
## [11] "snoRNA_gene" "snRNA"       "snRNA_gene"  "transcript"  "tRNA_gene"
##  [1] "I"    "II"   "III"  "IV"   "IX"   "Mito" "V"    "VI"   "VII"  "VIII"
## [11] "X"    "XI"   "XII"  "XIII" "XIV"  "XV"   "XVI"

view structure using str()

## Classes 'tbl_df', 'tbl' and 'data.frame':    7443 obs. of  6 variables:
##  $ chromosome: Factor w/ 17 levels "I","II","III",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ feature   : Factor w/ 15 levels "CDS","chromosome",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ start     : int  10091 11565 12046 13363 21566 22395 24000 31567 33448 35155 ...
##  $ stop      : int  10399 11951 12426 13743 21850 22685 27968 32940 34701 36303 ...
##  $ strand    : Factor w/ 3 levels "-",".","+": 3 1 3 1 3 1 1 3 3 3 ...
##  $ length    : int  308 386 380 380 284 290 3968 1373 1253 1148 ...

histogram of feature lengths

# Histogram of feature lengths. The geom layer was missing (the call ended in
# a dangling `+`); geom_histogram() matches the slide heading.
ggplot(data = yeast_features, mapping = aes(x = length)) +
  geom_histogram()

plot as a distribution

# Feature lengths drawn as a distribution line rather than bars.
# NOTE(review): the geom layer was missing from this extract (dangling `+`);
# geom_freqpoly() is assumed because the next slide colours that same geom.
ggplot(data = yeast_features, mapping = aes(x = length)) +
  geom_freqpoly()

color according to feature type

# Frequency polygon of feature length with one coloured line per feature type.
ggplot(yeast_features, aes(x = length)) +
  geom_freqpoly(aes(color = feature))


# Density-scaled length distribution, coloured by feature type.
# NOTE(review): the geom layer was missing from this extract (dangling `+`);
# geom_freqpoly() is assumed, matching the later density plots in this deck.
ggplot(
  data = yeast_features,
  mapping = aes(x = length, y = ..density.., color = feature)
) +
  geom_freqpoly()

Try it yourself

# Small practice data set: two categorical columns plus two columns of
# uniform random numbers. The closing parenthesis of tibble() was missing.
a <- tibble(
  Class = c("Y", "Y", "Y", "Y", "Y", "N", "N", "N", "N", "N"),
  Chirality = c("L", "R", "R", "R", "L", "R", "R", "R", "L", "R"),
  x = runif(10),
  y = runif(10)
)


# Feature length per chromosome as points, coloured by feature type.
ggplot(yeast_features, aes(x = chromosome, y = length)) +
  geom_point(mapping = aes(color = feature))

overplotting: jitter

# Same point plot, with the "jitter" position adjustment to reduce overplotting.
ggplot(yeast_features, aes(x = chromosome, y = length)) +
  geom_point(mapping = aes(color = feature), position = "jitter")

overplotting: alpha

# Jittered points drawn mostly transparent (alpha = 1/10) so dense regions
# appear darker.
ggplot(yeast_features, aes(x = chromosome, y = length)) +
  geom_jitter(mapping = aes(color = feature), alpha = 1/10)


# NOTE(review): this call ends in a dangling `+`; the layer that followed it is
# missing from this extract, so the snippet will not run as shown.
ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length)) +

mapping aesthetics within layers

# Here `color = feature` is mapped in ggplot() itself rather than in a layer.
# NOTE(review): this call ends in a dangling `+`; the layer that followed it is
# missing from this extract, so the snippet will not run as shown.
ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length, color = feature)) +

Combining layers

# Two layers on one plot: a boxplot per chromosome, overlaid with jittered
# pixel-sized points coloured by feature type.
ggplot(yeast_features, aes(x = chromosome, y = length)) +
  geom_boxplot() +
  geom_point(aes(color = feature), position = "jitter", shape = ".")


# Boxplot plus jittered points.
# NOTE(review): the final layer was missing from this extract (dangling `+`);
# scale_y_log10() is assumed because the later slides add it to this stack.
ggplot(data = yeast_features, mapping = aes(x = chromosome, y = length)) +
  geom_boxplot() +
  geom_point(position = "jitter", shape = ".", mapping = aes(color = feature)) +
  scale_y_log10()


# Chromosomes are ordered on the x axis by their median feature length.
# NOTE(review): the final layer was missing from this extract (dangling `+`);
# scale_y_log10() is assumed because the next slide shows it on this stack.
ggplot(
  data = yeast_features,
  mapping = aes(x = reorder(chromosome, length, FUN = median), y = length)
) +
  geom_boxplot() +
  geom_point(position = "jitter", shape = ".", mapping = aes(color = feature)) +
  scale_y_log10()

Flipping coordinates

# Same ordered boxplot with the axes swapped. The final layer was missing
# (dangling `+`); coord_flip() matches the "Flipping coordinates" heading.
ggplot(
  data = yeast_features,
  mapping = aes(x = reorder(chromosome, length, FUN = median), y = length)
) +
  geom_boxplot() +
  geom_point(position = "jitter", shape = ".", mapping = aes(color = feature)) +
  scale_y_log10() +
  coord_flip()


one categorical variable (factor): chromosome

# Count of features per chromosome, one filled bar each. The geom layer was
# missing (dangling `+`); geom_bar() matches the barplot slides that follow.
ggplot(data = yeast_features, mapping = aes(x = chromosome, fill = chromosome)) +
  geom_bar()


two categorical variables (factors): chromosome & feature

# Count of features per chromosome, bars filled by feature type. The geom
# layer was missing (dangling `+`); geom_bar() matches the "Barplot variant"
# slides that follow.
ggplot(data = yeast_features, mapping = aes(x = chromosome, fill = feature)) +
  geom_bar()

Barplot variant: fill

# Barplot variant using the "fill" position adjustment.
ggplot(yeast_features, aes(x = chromosome, fill = feature)) +
  geom_bar(position = "fill")

Barplot variant: dodge

# Barplot variant using the "dodge" position adjustment.
ggplot(yeast_features, aes(x = chromosome, fill = feature)) +
  geom_bar(position = "dodge")


# Summarise length per feature type with mean_sdl. The argument name was
# missing (`stat_summary( = mean_sdl)` does not parse); a summary function of
# this kind is supplied through `fun.data`.
ggplot(data = yeast_features, mapping = aes(x = feature, y = length)) +
  stat_summary(fun.data = mean_sdl)


# Length histogram filled by chromosome, one panel per chromosome.
ggplot(yeast_features, aes(x = length, fill = chromosome)) +
  geom_histogram() +
  facet_wrap(~chromosome)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Facetting (grid)

# Density histograms of length, one panel per feature/strand combination,
# arranged in two rows.
ggplot(yeast_features, aes(x = length, y = ..density..)) +
  geom_histogram() +
  facet_wrap(feature ~ strand, nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Scatter plots

# Scatter plot of start vs stop; size, colour and shape all encode the
# feature type.
# NOTE(review): the final layer was missing from this extract (dangling `+`);
# scale_y_log10() is assumed because the following slides pair it with
# scale_x_log10().
ggplot(data = yeast_features, mapping = aes(x = start, y = stop)) +
  geom_point(aes(size = feature, color = feature, shape = feature), alpha = 1/3) +
  scale_x_log10() +
  scale_y_log10()

Statistics: adding trend line

# Scatter plot with a trend line. The final layer was missing (dangling `+`);
# the console message recorded below this snippet shows it was geom_smooth()
# with its default method.
ggplot(data = yeast_features, mapping = aes(x = start, y = stop)) +
  geom_point(aes(size = feature, color = feature, shape = feature), alpha = 1/3) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'

Statistics: controlling trend line

# Same scatter plot, with the trend line fitted by method "lm" instead of the
# default smoother.
ggplot(yeast_features, aes(x = start, y = stop)) +
  geom_point(
    mapping = aes(size = feature, color = feature, shape = feature),
    alpha = 1/3
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth(method = "lm")

Colors (setting color)

# `fill = "red"` is given outside aes(), so it is a fixed colour set on the
# layer rather than a mapping from data.
ggplot(yeast_features, aes(x = length, fill = chromosome)) +
  geom_histogram(fill = "red") +
  scale_x_log10() +
  facet_wrap(~chromosome)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Zooming: coord_cartesian()

# Scatter plot with trend line, zoomed to 10-5000 on both axes via
# coord_cartesian().
ggplot(yeast_features, aes(x = start, y = stop)) +
  geom_point(
    mapping = aes(size = feature, color = feature, shape = feature),
    alpha = 1/3
  ) +
  scale_x_log10() +
  scale_y_log10() +
  geom_smooth() +
  coord_cartesian(xlim = c(10, 5000), ylim = c(10, 5000))
## `geom_smooth()` using method = 'gam'


# Density plot with a title and axis labels. The `labs(` opener was missing,
# leaving the three label arguments and the closing `)` orphaned.
ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  labs(
    title = "Distribution of feature sizes",
    x = "length (base pairs)",
    y = "probability density"
  ) +
  theme(legend.position = "bottom")

Themes : changes the overall look

# Labelled density plot with a complete theme applied. The `labs(` opener was
# missing and has been restored.
# NOTE(review): the theme layer was also missing from this extract (the call
# ended in a dangling `+`); theme_bw() is assumed - confirm against the slides.
# The complete theme is placed before theme() because a complete theme
# replaces earlier theme settings, which would undo legend.position.
ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  labs(
    title = "Distribution of feature sizes",
    x = "length (base pairs)",
    y = "probability density"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")


more themes available in add-on package ggthemes


Additional themes : theme_tufte()

# Density plot using the Tufte theme from the ggthemes add-on package. The
# final layer was missing (dangling `+`); the slide heading names
# theme_tufte(). It is namespace-qualified so no library() call is needed.
ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  ggthemes::theme_tufte()

Additional themes : theme_excel()

Don’t do this!

# Density plot using the Excel theme from ggthemes (shown as a "don't do
# this" example). The final layer was missing (dangling `+`); the slide
# heading names theme_excel().
ggplot(data = yeast_features, mapping = aes(x = length, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = feature)) +
  scale_x_log10() +
  ggthemes::theme_excel()

Generic plot function

ggplot(data = <DATA>, mapping = aes(<Mapping>)) +
        <GEOM_FUNCTION>() + 
        <STAT_FUNCTION>() +
        <FACET_FUNCTION>() + 
        <SCALE_FUNCTION>() +

Concise code

# Concise form: `data`, `mapping`, `x` and `y` are passed positionally.
# NOTE(review): the original ended in a dangling `+` whose following layer is
# missing from this extract; the `+` is dropped so the snippet parses.
ggplot(yeast_features, aes(length, ..density..)) +
  geom_freqpoly(aes(color = feature)) +
  scale_x_log10() +
  theme(legend.position = "bottom")
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing non-finite values (stat_bin).

saving plots as variables

# Store the plot object in a variable so layers can be added to it later.
# The `labs(` opener was missing and has been restored.
# NOTE(review): the original ended in a dangling `+` whose following layer is
# missing from this extract; the `+` is dropped so the assignment parses.
my_plot <- ggplot(yeast_features, aes(length, ..density..)) +
  geom_freqpoly(aes(color = feature)) +
  scale_x_log10() +
  labs(
    title = "Distribution of feature sizes",
    x = "length (base pairs)",
    y = "probability density"
  ) +
  theme(legend.position = "bottom")

adding to plot variables

# Add a red horizontal reference line at y = 2 to the stored plot.
my_plot +
  geom_hline(yintercept = 2, color = "red")
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing non-finite values (stat_bin).

Exercise 1

Add: 1. new axis labels, 2. title, 3. trend line, and 4. change theme to theme_tufte()

# Exercise starting point: scatter plot of the practice tibble `a`, coloured
# by Class, with enlarged points.
ggplot(a, aes(x = x, y = y, color = Class)) +
  geom_point(size = 3)

Read in and modify gff

# Read the yeast GFF3 annotation: tab-delimited, no header row, `#` starts a
# comment, and the first 24 lines are skipped.
gff <- read_delim(
  "Saccharomyces_cerevisiae.R64-1-1.34.gff3",
  "\t",
  escape_double = FALSE, col_names = FALSE,
  comment = "#", trim_ws = TRUE, skip = 24
)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character(),
##   X4 = col_integer(),
##   X5 = col_integer(),
##   X6 = col_character(),
##   X7 = col_character(),
##   X8 = col_character(),
##   X9 = col_character()
## )
# Name the nine columns in GFF3 order. The original vector was truncated after
# "chromosome". "feature", "start", "stop" and "strand" are the names used
# below; NOTE(review): "source", "score", "phase" and "attributes" follow the
# GFF3 specification and are assumed - confirm against the original slides.
names(gff) <- c(
  "chromosome", "source", "feature", "start", "stop",
  "score", "strand", "phase", "attributes"
)
# correct data types
gff$feature <- as.factor(gff$feature)
gff$chromosome <- as.factor(gff$chromosome)
gff$strand <- as.factor(gff$strand)

# Keep the positional columns, add each feature's length, and retain only the
# five feature types used in the plots.
yeast_features <- gff %>%
  select(chromosome, feature, start, stop, strand) %>%
  mutate(length = abs(start - stop)) %>%
  filter(feature %in% c("CDS", "rRNA", "snoRNA", "snRNA", "tRNA_gene"))

Exercise 2

Plot as barplots with error bars

# Exercise starting point: summarise length per feature type with mean_sdl.
# The argument name was missing (`stat_summary( = mean_sdl)` does not parse);
# a summary function of this kind is supplied through `fun.data`.
ggplot(data = yeast_features, mapping = aes(x = feature, y = length)) +
  stat_summary(fun.data = mean_sdl)

Exercise 3

Change bin size for histogram

ggplot(data = yeast_features, mapping = aes(x = length)) +