Outline

Load packages

library(tidyverse)
Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages -----------------------------------------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats
library(stringr)

Tidy data

Is this table tidy?

table2

Is this table tidy?

table3

Is this table tidy?

table1

Is this table tidy?

gene_expression

Is this table tidy?

facs_data

what are tidy data?

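Jeff Leek, in his book The Elements of Data Analytic Style, summarizes the characteristics of tidy data as the following points:

  • Each variable you measure should be in one column
  • Each different observation of that variable should be in a different row
  • There should be one table for each "kind" of variable
  • If you have multiple tables, they should include a column that allows them to be linked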

gather()

used when column names are not names of variables, but values of a variable (e.g. time). makes tables longer and skinny (previously known as melting)

gene_expression

gather()

used when column names are not names of variables, but values of a variable (e.g. time). makes tables longer and skinny (previously known as melting)

# Collapse the t0:t2 columns into a timepoint/expression key-value pair.
gene_expression %>%
  gather(key = "timepoint", value = "expression", t0:t2)

spread()

Spreading is the opposite of gathering. Used when an observation is scattered across multiple rows. spread() makes tables shorter and wider

facs_data

spread()

Spreading is the opposite of gathering. Used when an observation is scattered across multiple rows. spread() makes tables shorter and wider

# Turn each distinct value of Measure into its own column, filled from Value.
facs_data %>%
  spread(key = Measure, value = Value)

Exercise 1

put table2 in tidy format

# Show rows 1 through 6 of table2 (all columns).
table2[seq_len(6), ]

Exercise 2

convert table1 to table2

table1

data import

The readr package has numerous functions for reading files in as tibbles

# Read the yeast GFF3 annotation as a tab-delimited tibble with no header row.
# The first 24 lines are skipped and any remaining "#" lines are treated as
# comments (presumably file header/directive lines -- confirm against the file).
gff <- read_delim(
  file = "Saccharomyces_cerevisiae.R64-1-1.34.gff3",
  delim = "\t",
  col_names = FALSE,
  comment = "#",
  skip = 24,
  trim_ws = TRUE,
  escape_double = FALSE
)
Parsed with column specification:
cols(
  X1 = col_character(),
  X2 = col_character(),
  X3 = col_character(),
  X4 = col_integer(),
  X5 = col_integer(),
  X6 = col_character(),
  X7 = col_character(),
  X8 = col_character(),
  X9 = col_character()
)

Peek at data

a tibble is a dataframe

head(gff)

Look at data structure with str()

str(gff)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   28848 obs. of  9 variables:
 $ X1: chr  "I" "I" "I" "I" ...
 $ X2: chr  "SGD" "SGD" "SGD" "SGD" ...
 $ X3: chr  "CDS" "gene" "mRNA" "exon" ...
 $ X4: int  10091 11565 11565 11565 11565 12046 12046 12046 12046 13363 ...
 $ X5: int  10399 11951 11951 11951 11951 12426 12426 12426 12426 13743 ...
 $ X6: chr  "." "." "." "." ...
 $ X7: chr  "+" "-" "-" "-" ...
 $ X8: chr  "0" "." "." "." ...
 $ X9: chr  "ID=CDS:YAL066W;Parent=transcript:YAL066W;protein_id=YAL066W" "ID=gene:YAL065C;biotype=protein_coding;description=Putative protein of unknown function%3B has homology to FLO1%3B possible pse"| __truncated__ "ID=transcript:YAL065C;Parent=gene:YAL065C;biotype=protein_coding;transcript_id=YAL065C" "Parent=transcript:YAL065C;Name=YAL065C.1;constitutive=1;ensembl_end_phase=0;ensembl_phase=0;exon_id=YAL065C.1;rank=1" ...
 - attr(*, "spec")=List of 2
  ..$ cols   :List of 9
  .. ..$ X1: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ X2: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ X3: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ X4: list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ X5: list()
  .. .. ..- attr(*, "class")= chr  "collector_integer" "collector"
  .. ..$ X6: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ X7: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ X8: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  .. ..$ X9: list()
  .. .. ..- attr(*, "class")= chr  "collector_character" "collector"
  ..$ default: list()
  .. ..- attr(*, "class")= chr  "collector_guess" "collector"
  ..- attr(*, "class")= chr "col_spec"

Look at the data the tidyverse way

using glimpse()

glimpse(gff)
Observations: 28,848
Variables: 9
$ X1 <chr> "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "...
$ X2 <chr> "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD"...
$ X3 <chr> "CDS", "gene", "mRNA", "exon", "CDS", "gene", "mRNA", "exon", "CDS", "gene", "mRNA", "exon", "CD...
$ X4 <int> 10091, 11565, 11565, 11565, 11565, 12046, 12046, 12046, 12046, 13363, 13363, 13363, 13363, 21566...
$ X5 <int> 10399, 11951, 11951, 11951, 11951, 12426, 12426, 12426, 12426, 13743, 13743, 13743, 13743, 21850...
$ X6 <chr> ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", "...
$ X7 <chr> "+", "-", "-", "-", "-", "+", "+", "+", "+", "-", "-", "-", "-", "+", "+", "+", "+", "-", "-", "...
$ X8 <chr> "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", "...
$ X9 <chr> "ID=CDS:YAL066W;Parent=transcript:YAL066W;protein_id=YAL066W", "ID=gene:YAL065C;biotype=protein_...

Assign meaningful names to columns

same approach as naming dataframe columns in base R

# Replace the default X1..X9 headers with descriptive column names.
# NOTE(review): in the GFF3 specification columns 6 and 8 are "score" and
# "phase"; the placeholder names are kept here to match the printed output.
colnames(gff) <- c(
  "chromosome", "source", "feature",
  "start", "stop", "unknown1",
  "strand", "unknown2", "info"
)

Dataframe now has meaningful names

note that tidyverse tries to guess data type

glimpse(gff)
Observations: 28,848
Variables: 9
$ chromosome <chr> "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I", "I"...
$ source     <chr> "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD...
$ feature    <chr> "CDS", "gene", "mRNA", "exon", "CDS", "gene", "mRNA", "exon", "CDS", "gene", "mRNA", "ex...
$ start      <int> 10091, 11565, 11565, 11565, 11565, 12046, 12046, 12046, 12046, 13363, 13363, 13363, 1336...
$ stop       <int> 10399, 11951, 11951, 11951, 11951, 12426, 12426, 12426, 12426, 13743, 13743, 13743, 1374...
$ unknown1   <chr> ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", "."...
$ strand     <chr> "+", "-", "-", "-", "-", "+", "+", "+", "+", "-", "-", "-", "-", "+", "+", "+", "+", "-"...
$ unknown2   <chr> "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", "."...
$ info       <chr> "ID=CDS:YAL066W;Parent=transcript:YAL066W;protein_id=YAL066W", "ID=gene:YAL065C;biotype=...

assign columns proper datatypes

assigning the correct data type is critical for analyses and plotting with ggplot()

# Categorical columns are stored as factors rather than character vectors.
gff[["feature"]] <- as.factor(gff[["feature"]])
gff[["chromosome"]] <- as.factor(gff[["chromosome"]])
gff[["strand"]] <- as.factor(gff[["strand"]])

# Confirm the new column types.
glimpse(gff)
Observations: 28,848
Variables: 9
$ chromosome <fctr> I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ...
$ source     <chr> "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD", "SGD...
$ feature    <fctr> CDS, gene, mRNA, exon, CDS, gene, mRNA, exon, CDS, gene, mRNA, exon, CDS, gene, mRNA, e...
$ start      <int> 10091, 11565, 11565, 11565, 11565, 12046, 12046, 12046, 12046, 13363, 13363, 13363, 1336...
$ stop       <int> 10399, 11951, 11951, 11951, 11951, 12426, 12426, 12426, 12426, 13743, 13743, 13743, 1374...
$ unknown1   <chr> ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", ".", "."...
$ strand     <fctr> +, -, -, -, -, +, +, +, +, -, -, -, -, +, +, +, +, -, -, -, -, -, -, -, -, +, +, +, +, ...
$ unknown2   <chr> "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", ".", ".", ".", "0", "."...
$ info       <chr> "ID=CDS:YAL066W;Parent=transcript:YAL066W;protein_id=YAL066W", "ID=gene:YAL065C;biotype=...

Select columns using select()

# Keep the positional columns plus `info`. Because gff is overwritten here,
# dropping `info` would break the later select(gff, info) call and every
# separate(col = "info", ...) pipeline, which all read the attribute string.
gff <- select(gff, chromosome, feature, start, stop, strand, info)
head(gff)

Add a column with mutate()

# Add a `length` column: the absolute difference between start and stop.
gff <- gff %>%
  mutate(length = abs(start - stop))
head(gff)

Sort tibble by column with arrange()

writing dplyr::arrange specifies the package and function

# Sort rows by length, smallest first; the dplyr:: prefix names the package.
gff %>%
  dplyr::arrange(length)

Sort by feature size with arrange()

sort largest to smallest using -

# Negating the sort column reverses the order: largest features first.
gff %>%
  dplyr::arrange(-length)

Analyze with summarize()

this creates a new tibble/dataframe

# Collapse the whole table into one row of summary statistics for `length`.
gff %>%
  summarise(
    mean = mean(length),
    sd = sd(length),
    min = min(length),
    max = max(length),
    n = n()
  )

Analyze with summarize()

the function n() counts how many observations there are

# Same summary as before; n() reports the number of rows summarised.
gff %>%
  summarise(
    mean = mean(length),
    sd = sd(length),
    min = min(length),
    max = max(length),
    n = n()
  )

using the pipe: %>%

using the pipe: %>%

subset data with group_by()

# Per-feature summary: compute length, group rows by feature type, then
# summarise each group into a single row.
gff %>%
  mutate(length = abs(start - stop)) %>%
  group_by(feature) %>%
  summarise(
    mean = mean(length),
    sd = sd(length),
    min = min(length),
    max = max(length),
    n = n()
  )

Filter rows with filter()

# Drop four feature types before summarising. Conditions passed to filter()
# as separate arguments are combined with AND.
gff %>%
  filter(
    feature != "mRNA",
    feature != "rRNA_gene",
    feature != "snoRNA_gene",
    feature != "snRNA_gene"
  ) %>%
  mutate(length = abs(start - stop)) %>%
  group_by(feature) %>%
  summarise(
    mean = mean(length),
    sd = sd(length),
    min = min(length),
    max = max(length),
    n = n()
  )

Pass dataframe to ggplot for plotting

NOTE: ggplot uses + not the pipe %>%

# Histogram of CDS lengths. The data frame is piped into ggplot(); layers
# are then added with `+`, not with the pipe.
gff %>%
  filter(feature == "CDS") %>%
  ggplot(aes(x = length)) +
  geom_histogram(bins = 100)

Exercise 3

plot the population of each country in 1999 using %>% and ggplot()

table1

Exercise 3

String manipulation with stringr

How do we get the gene names?

# Show only the raw attribute string column.
gff %>%
  select(info)

Separate values in column with separate()

# Keep gene rows only, then pull the `info` attribute string apart.
gff %>%
  mutate(length = abs(start - stop)) %>%
  filter(feature == "gene") %>%
  # Split on ";" into five pieces; extra = "merge" leaves any remaining
  # text in info5 instead of discarding it.
  separate(
    col = "info",
    into = c("info1", "info2", "info3", "info4", "info5"),
    sep = ";",
    extra = "merge"
  ) %>%
  # Each following step splits one piece at its separator text.
  separate(col = "info1", into = c("junk", "Systematic_name"), sep = ":") %>%
  separate(col = "info2", into = c("junk2", "Gene"), sep = "Name=") %>%
  separate(
    col = "info3",
    into = c("junk3", "Description1"),
    sep = "description="
  ) %>%
  separate(
    col = "info4",
    into = c("junk4", "Description2"),
    sep = "description="
  ) %>%
  select(Description1, Description2)

Combine columns with unite()

# Same splitting steps as before, then paste the two candidate description
# columns back together into a single `Description` column.
gff %>%
  mutate(length = abs(start - stop)) %>%
  filter(feature == "gene") %>%
  separate(
    col = "info",
    into = c("info1", "info2", "info3", "info4", "info5"),
    sep = ";",
    extra = "merge"
  ) %>%
  separate(col = "info1", into = c("junk", "Systematic_name"), sep = ":") %>%
  separate(col = "info2", into = c("junk2", "Gene"), sep = "Name=") %>%
  separate(
    col = "info3",
    into = c("junk3", "Description1"),
    sep = "description="
  ) %>%
  separate(
    col = "info4",
    into = c("junk4", "Description2"),
    sep = "description="
  ) %>%
  # sep = "" joins the two pieces with nothing in between.
  unite(Description, Description1, Description2, sep = "") %>%
  select(Description)

Save to a new variable

A general rule is if you are piping more than 10 steps save as a new variable

# Build the cleaned gene table: one row per gene with its systematic name,
# gene name, and combined description. Stored in gff_clean for later steps.
gff_clean <- gff %>%
  mutate(length = abs(start - stop)) %>%
  filter(feature == "gene") %>%
  separate(
    col = "info",
    into = c("info1", "info2", "info3", "info4", "info5"),
    sep = ";",
    extra = "merge"
  ) %>%
  separate(col = "info1", into = c("junk", "Systematic_name"), sep = ":") %>%
  separate(col = "info2", into = c("junk2", "Gene"), sep = "Name=") %>%
  separate(
    col = "info3",
    into = c("junk3", "Description1"),
    sep = "description="
  ) %>%
  separate(
    col = "info4",
    into = c("junk4", "Description2"),
    sep = "description="
  ) %>%
  unite(Description, Description1, Description2, sep = "") %>%
  select(Systematic_name, Gene, Description)

Clean up strings with stringr

# "%3B" and "%2C" are the percent-encoded forms of ";" and ",". Decode them
# instead of deleting them, so clauses in a description stay separated.
# fixed() matches the text literally rather than as a regular expression.
gff_clean$Description <- str_replace_all(
  gff_clean$Description, fixed("%3B"), ";"
)
gff_clean$Description <- str_replace_all(
  gff_clean$Description, fixed("%2C"), ","
)
# Strip a leading "NA" (presumably the text unite() pastes in for a missing
# first description piece -- confirm).
# NOTE(review): a trailing "NA" from a missing second piece is not removed.
gff_clean$Description <- str_replace_all(gff_clean$Description, "^NA", "")

gff_clean %>%
select(c(Description))

Write file

# Save the cleaned gene table to "Yeast_genes.txt" as tab-separated values;
# missing values are written as the text "NA".
write_tsv(gff_clean, "Yeast_genes.txt", na = "NA")

How do we combine tables?

Mutating joins

A mutating join allows you to combine variables from two tables by matching observations by their keys

1. Inner Join

matches pairs of observations from two tables whenever their keys are equal

2. Outer join

keeps observations that appear in at least one of the tables

  • left join keeps all the observations in x (should be the default)
  • right join keeps all the observations in y
  • full join keeps all observations in x and y

Filtering joins

affects (filters) the observations not the variables

  • semi_join(x, y) keeps all observations in x that have a match in y
  • anti_join(x, y) drops all observations in x that have a match in y

Dataset must contain common values (gene names)

str(data)

File to join with

str(gff_clean)

Joining data

dplyr::left_join(a, b, by = "x1") Join matching rows from b to a.

# Keep every row of gff_clean and attach matching rows of `data`, pairing
# gff_clean$Systematic_name with data$Syst; then inspect the structure.
gff_clean %>%
  left_join(data, by = c("Systematic_name" = "Syst")) %>%
  str()

Exercise 4

tidy data don’t allow correlation plots

# Long-format copy of gene_expression: the t0:t2 columns become
# timepoint/expression pairs. Printed afterwards for inspection.
tidy_gene_expression <- gather(
  gene_expression,
  key = "timepoint",
  value = "expression",
  t0:t2
)
tidy_gene_expression

Rearrange the data

Plot correlation between t0 and t1

Exercise 5

example

# Bar chart of feature counts per chromosome; position = "dodge" places the
# bars for each feature side by side instead of stacking them.
ggplot(yeast_features, aes(x = chromosome, fill = feature)) +
  geom_bar(position = "dodge")

Resources

