RaukR 2018 Notes - Day 5: Bioconductor and GWAS

0.1 Bioconductor

Open source software for bioinformatics
Started in 2002, by Robert Gentleman
Started in a time of microarrays
Statistical analysis and comprehension of high-throughput genomic data
Over 1500 R packages
To install packages from bioconductor:
- source("https://bioconductor.org/biocLite.R")
- biocLite("package")
Under “all packages” on bioconductor, “workflow” packages contain workflows for various analyses

0.1.1 Use of Biostrings package

library(Biostrings)
library(dplyr)
library(ggplot2)
library(GenomicRanges)
library(airway)
library(SingleCellExperiment)

dna <- c("ACTAGACCA", "GGGTCGACA")

dna_set <- DNAStringSet(dna)

reverseComplement(dna_set)

##   A DNAStringSet instance of length 2
##     width seq
## [1]     9 TGGTCTAGT
## [2]     9 TGTCGACCC

# Get file from Biostrings package library files
filepath <- system.file(package="Biostrings", "extdata", "dm3_upstream2000.fa.gz")

readLines(filepath, 6)

## [1] ">NM_078863_up_2000_chr2L_16764737_f chr2L:16764737-16766736"
## [2] "gttggtggcccaccagtgccaaaatacacaagaagaagaaacagcatctt"         
## [3] "gacactaaaatgcaaaaattgctttgcgtcaatgactcaaaacgaaaatg"         
## [4] "tttttgtgcttttcgaacaaaaaattgggagcagaatattggattcgctt"         
## [5] "ttttcgcatcgaatatcttaagggagcaagggaagaaccatctagaataa"         
## [6] "taaagaagaccaaaatgtatcgtaactaaaggttttttttattaattatt"

# Read DNA seq from file to DNAstringset object
dna2 <- Biostrings::readDNAStringSet(filepath)

length(dna2)

## [1] 26454

reverseComplement(dna2)

##   A DNAStringSet instance of length 26454
##         width seq                                      names               
##     [1]  2000 ACCGTGCAACCGGTAAACT...CACTGGTGGGCCACCAAC NM_078863_up_2000...
##     [2]  2000 ATCGAGGATGACTTTCCGT...GGGCGCCTACATAAATAA NM_001201794_up_2...
##     [3]  2000 ATCGAGGATGACTTTCCGT...GGGCGCCTACATAAATAA NM_001201795_up_2...
##     [4]  2000 ATCGAGGATGACTTTCCGT...GGGCGCCTACATAAATAA NM_001201796_up_2...
##     [5]  2000 ATCGAGGATGACTTTCCGT...GGGCGCCTACATAAATAA NM_001201797_up_2...
##     ...   ... ...
## [26450]  2000 GTTTTATTGAAATTTAATT...TTTATTAGTCTTGTAAAT NM_001111010_up_2...
## [26451]  2000 AATATATAACAACTCAAAC...GGTCGTCCTTCGTATATC NM_001015258_up_2...
## [26452]  2000 AATATATAACAACTCAAAC...GGTCGTCCTTCGTATATC NM_001110997_up_2...
## [26453]  2000 AATATATAACAACTCAAAC...GGTCGTCCTTCGTATATC NM_001276245_up_2...
## [26454]  2000 CAATTTGTTCTTACACTTT...AGTTAACTAATACATACG NM_001015497_up_2...

# Calculates GC-content in each sequence
gc_cont <- letterFrequency(dna2, "GC", as.prob = TRUE)

class(gc_cont)

## [1] "matrix"

dim(gc_cont)

## [1] 26454     1

head(gc_cont)

##         G|C
## [1,] 0.3785
## [2,] 0.4300
## [3,] 0.4300
## [4,] 0.4300
## [5,] 0.4300
## [6,] 0.4300

hist(gc_cont)

# In ggplot2

gc_df <- as_tibble(gc_cont)

ggplot(gc_df, aes(`G|C`))+
  geom_histogram(color = "white")+
  theme_classic()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Getting the class of the DNAstringset class
getClass(Class = class(dna2))

## Class "DNAStringSet" [package "Biostrings"]
## 
## Slots:
##                                                             
## Name:               pool            ranges       elementType
## Class:    SharedRaw_Pool    GroupedIRanges         character
##                                           
## Name:    elementMetadata          metadata
## Class: DataTable_OR_NULL              list
## 
## Extends: 
## Class "XStringSet", directly
## Class "XRawList", by class "XStringSet", distance 2
## Class "XVectorList", by class "XStringSet", distance 3
## Class "List", by class "XStringSet", distance 4
## Class "Vector", by class "XStringSet", distance 5
## Class "list_OR_List", by class "XStringSet", distance 5
## Class "Annotated", by class "XStringSet", distance 6
## 
## Known Subclasses: "QualityScaledDNAStringSet"

# S4 packages are printed in this way
class(dna2)

## [1] "DNAStringSet"
## attr(,"package")
## [1] "Biostrings"

isS4(dna2)

## [1] TRUE

# List available methods in DNAStringSet
methods(class="DNAStringSet")

##   [1] !                               !=                             
##   [3] $                               $<-                            
##   [5] %in%                            [                              
##   [7] [[                              [[<-                           
##   [9] [<-                             <                              
##  [11] <=                              ==                             
##  [13] >                               >=                             
##  [15] aggregate                       alphabetFrequency              
##  [17] anyNA                           append                         
##  [19] as.character                    as.complex                     
##  [21] as.data.frame                   as.env                         
##  [23] as.factor                       as.integer                     
##  [25] as.list                         as.logical                     
##  [27] as.matrix                       as.numeric                     
##  [29] as.raw                          as.vector                      
##  [31] by                              c                              
##  [33] cbind                           chartr                         
##  [35] coerce                          compact                        
##  [37] compareStrings                  complement                     
##  [39] concatenateObjects              consensusMatrix                
##  [41] consensusString                 countOverlaps                  
##  [43] countPattern                    countPDict                     
##  [45] dinucleotideFrequencyTest       do.call                        
##  [47] droplevels                      duplicated                     
##  [49] elementMetadata                 elementMetadata<-              
##  [51] elementNROWS                    elementType                    
##  [53] eval                            expand                         
##  [55] expand.grid                     extractAt                      
##  [57] extractROWS                     Filter                         
##  [59] findOverlaps                    getListElement                 
##  [61] hasOnlyBaseLetters              head                           
##  [63] ifelse2                         intersect                      
##  [65] is.na                           is.unsorted                    
##  [67] isEmpty                         isMatchingEndingAt             
##  [69] isMatchingStartingAt            lapply                         
##  [71] length                          lengths                        
##  [73] letterFrequency                 match                          
##  [75] matchPattern                    matchPDict                     
##  [77] mcols                           mcols<-                        
##  [79] merge                           metadata                       
##  [81] metadata<-                      mstack                         
##  [83] names                           names<-                        
##  [85] nchar                           neditEndingAt                  
##  [87] neditStartingAt                 NROW                           
##  [89] nucleotideFrequencyAt           oligonucleotideFrequency       
##  [91] order                           overlapsAny                    
##  [93] PairwiseAlignments              PairwiseAlignmentsSingleSubject
##  [95] parallelSlotNames               parallelVectorNames            
##  [97] pcompare                        pcompareRecursively            
##  [99] PDict                           PWM                            
## [101] rank                            rbind                          
## [103] Reduce                          relist                         
## [105] relistToClass                   rename                         
## [107] rep                             rep.int                        
## [109] replaceAt                       replaceLetterAt                
## [111] replaceROWS                     rev                            
## [113] revElements                     reverse                        
## [115] reverseComplement               ROWNAMES                       
## [117] sapply                          seqinfo                        
## [119] seqinfo<-                       seqlevelsInUse                 
## [121] seqtype                         seqtype<-                      
## [123] setdiff                         setequal                       
## [125] setListElement                  shiftApply                     
## [127] show                            showAsCell                     
## [129] sort                            split                          
## [131] split<-                         stack                          
## [133] stringDist                      strsplit                       
## [135] subseq                          subseq<-                       
## [137] subset                          subsetByOverlaps               
## [139] table                           tail                           
## [141] tapply                          threebands                     
## [143] toString                        transform                      
## [145] translate                       trimLRPatterns                 
## [147] twoWayAlphabetFrequency         union                          
## [149] unique                          uniqueLetters                  
## [151] unlist                          unsplit                        
## [153] unstrsplit                      updateObject                   
## [155] values                          values<-                       
## [157] vcountPattern                   vcountPDict                    
## [159] vmatchPattern                   vwhichPDict                    
## [161] which.isMatchingEndingAt        which.isMatchingStartingAt     
## [163] whichPDict                      width                          
## [165] window                          window<-                       
## [167] windows                         with                           
## [169] within                          xtabs                          
## [171] xvcopy                          zipdown                        
## see '?methods' for accessing help and source code

0.1.2 GenomicRanges

gr <- GRanges(c("chr1:10-19","chr1:30-39"))
gr

## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1     10-19      *
##   [2]     chr1     30-39      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

length(gr)

## [1] 2

# Add a column with sequence data
mcols(gr) <- data.frame(x = c(1,2))
gr

## GRanges object with 2 ranges and 1 metadata column:
##       seqnames    ranges strand |         x
##          <Rle> <IRanges>  <Rle> | <numeric>
##   [1]     chr1     10-19      * |         1
##   [2]     chr1     30-39      * |         2
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

width(gr)

## [1] 10 10

# Shifts the position of the ranges by 1
shift(gr, 1)

## GRanges object with 2 ranges and 1 metadata column:
##       seqnames    ranges strand |         x
##          <Rle> <IRanges>  <Rle> | <numeric>
##   [1]     chr1     11-20      * |         1
##   [2]     chr1     31-40      * |         2
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

# Introduce snps
snp <- GRanges(c("chr1:15","chr1:22"))

snp

## GRanges object with 2 ranges and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1        15      *
##   [2]     chr1        22      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

# Identify overlaps between snps and the genes
countOverlaps(snp, gr)

## [1] 1 0

snp[countOverlaps(snp, gr) > 0]

## GRanges object with 1 range and 0 metadata columns:
##       seqnames    ranges strand
##          <Rle> <IRanges>  <Rle>
##   [1]     chr1        15      *
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

findOverlaps(snp, gr)

## Hits object with 1 hit and 0 metadata columns:
##       queryHits subjectHits
##       <integer>   <integer>
##   [1]         1           1
##   -------
##   queryLength: 2 / subjectLength: 2

# Overlapping genes
gene <- GRanges(c("chr1:10-19","chr1:12-20","chr1:30-39"))

mcols(gene) <- data.frame(symbol=c("foo","bar","baz"))

gene

## GRanges object with 3 ranges and 1 metadata column:
##       seqnames    ranges strand |   symbol
##          <Rle> <IRanges>  <Rle> | <factor>
##   [1]     chr1     10-19      * |      foo
##   [2]     chr1     12-20      * |      bar
##   [3]     chr1     30-39      * |      baz
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

countOverlaps(gene)

## [1] 2 2 1

0.1.3 Airway dataset (RNASeq)

data(airway)

airway

## class: RangedSummarizedExperiment 
## dim: 64102 8 
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample

head(assay(airway))

##                 SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516
## ENSG00000000003        679        448        873        408       1138
## ENSG00000000005          0          0          0          0          0
## ENSG00000000419        467        515        621        365        587
## ENSG00000000457        260        211        263        164        245
## ENSG00000000460         60         55         40         35         78
## ENSG00000000938          0          0          2          0          1
##                 SRR1039517 SRR1039520 SRR1039521
## ENSG00000000003       1047        770        572
## ENSG00000000005          0          0          0
## ENSG00000000419        799        417        508
## ENSG00000000457        331        233        229
## ENSG00000000460         63         76         60
## ENSG00000000938          0          0          0

colData(airway)

## DataFrame with 8 rows and 9 columns
##            SampleName     cell      dex    albut        Run avgLength
##              <factor> <factor> <factor> <factor>   <factor> <integer>
## SRR1039508 GSM1275862   N61311    untrt    untrt SRR1039508       126
## SRR1039509 GSM1275863   N61311      trt    untrt SRR1039509       126
## SRR1039512 GSM1275866  N052611    untrt    untrt SRR1039512       126
## SRR1039513 GSM1275867  N052611      trt    untrt SRR1039513        87
## SRR1039516 GSM1275870  N080611    untrt    untrt SRR1039516       120
## SRR1039517 GSM1275871  N080611      trt    untrt SRR1039517       126
## SRR1039520 GSM1275874  N061011    untrt    untrt SRR1039520       101
## SRR1039521 GSM1275875  N061011      trt    untrt SRR1039521        98
##            Experiment    Sample    BioSample
##              <factor>  <factor>     <factor>
## SRR1039508  SRX384345 SRS508568 SAMN02422669
## SRR1039509  SRX384346 SRS508567 SAMN02422675
## SRR1039512  SRX384349 SRS508571 SAMN02422678
## SRR1039513  SRX384350 SRS508572 SAMN02422670
## SRR1039516  SRX384353 SRS508575 SAMN02422682
## SRR1039517  SRX384354 SRS508576 SAMN02422673
## SRR1039520  SRX384357 SRS508579 SAMN02422683
## SRR1039521  SRX384358 SRS508580 SAMN02422677

0.1.4 Tasks

Single-cell RNASeq

url <- "https://scrnaseq-public-datasets.s3.amazonaws.com/scater-objects/manno_mouse.rds"
fl <- BiocFileCache::bfcrpath(rnames = url)
sce <- readRDS(fl)

0.2 Genome-Wide Association Studies

Find genes associated with a specific trait
Create plot of phenotype vs genotype, then fit linear regression line

library(GenABEL)

0.2.1 Look at data

# Import data as gwaa class (S4)
genodata <- load.gwaa.data(
            phenofile = "phenotype.dat", 
            genofile = "genotype.raw", 
            makemap = F, 
            sort = F)

## ids loaded...
## marker names loaded...
## chromosome data loaded...
## map data loaded...
## allele coding data loaded...
## strand data loaded...
## genotype data loaded...
## snp.data object created...
## assignment of gwaa.data object FORCED; X-errors were not checked!

# Class: gwaa.data
class(genodata)

## [1] "gwaa.data"
## attr(,"package")
## [1] "GenABEL"

# Class: snp.data
class(genodata@gtdata)

## [1] "snp.data"
## attr(,"package")
## [1] "GenABEL"

# Extract genotype data
gtdata <- genodata@gtdata

# How many snps?
nsnps(gtdata)

## [1] 729048

# How many individuals?
nids(gtdata)

## [1] 200

# Individual 27, marker 5, 6 and 7
# Genotype is represented in binary format in @gtps (40, 40, 80)
gtdata[27, 5:7]

## @nids = 1 
## @nsnps = 3 
## @nbytes = 1 
## @idnames = id2_26 
## @snpnames = rs12124819 rs4040617 rs4970383 
## @chromosome = 1 1 1 
## @coding =  0b 04 04 
## @strand =  00 00 00 
## @map = 766409 769185 828418 
## @male = 0 
## @gtps = 
## 40 40 80

# Access the genotype non-binary
as.character(gtdata[27, 5:7])

##        rs12124819 rs4040617 rs4970383
## id2_26 "G/G"      "A/A"     "A/G"

# Summary of all individuals on 5 markers
# NoMeasured = how many individuals with the genotype on that row
sum_data <- summary(gtdata[,1:5])

head(sum_data)

plot(sum_data$Q.2)

# Phenotypic data
phenodata <- genodata@phdata

summary(phenodata)

##       id                 sex            pheno    
##  Length:200         Min.   :0.000   Min.   :0.0  
##  Class :character   1st Qu.:0.000   1st Qu.:0.0  
##  Mode  :character   Median :1.000   Median :0.5  
##                     Mean   :0.535   Mean   :0.5  
##                     3rd Qu.:1.000   3rd Qu.:1.0  
##                     Max.   :1.000   Max.   :1.0

# Access phenotype data
# Which sexes for which phenotype?
table(phenodata[,2:3])

##    pheno
## sex  0  1
##   0 43 50
##   1 57 50

0.2.2 Quality control

# Filter out people/markers with extremely low call rate
qc1 <- check.marker(genodata,
                    callrate = 0.95,
                    perid.call = 0.99,
                    maf = 0.05,
                    hweidsubset = genodata@phdata$pheno == 0)

## Excluding people/markers with extremely low call rate...
## 729048 markers and 200 people in total
## 0 people excluded because of call rate < 0.1 
## 0 markers excluded because of call rate < 0.1 
## Passed: 729048 markers and 200 people
## 
## RUN 1 
## 729048 markers and 200 people in total
## 105778 (14.50906%) markers excluded as having low (<5%) minor allele frequency
## 934 (0.1281123%) markers excluded because of low (<95%) call rate
## 0 (0%) markers excluded because they are out of HWE (FDR <0.2)
## 107 (53.5%) people excluded because of low (<99%) call rate
## Mean autosomal HET is 0.343121 (s.e. 0.006399162)
## 0 people excluded because too high autosomal heterozygosity (FDR <1%)
## Mean IBS is 0.7155259 (s.e. 0.007647288), as based on 2000 autosomal markers
## 0 (0%) people excluded because of too high IBS (>=0.95)
## In total, 622431 (85.37586%) markers passed all criteria
## In total, 93 (46.5%) people passed all criteria
## 
## RUN 2 
## 622431 markers and 93 people in total
## 11695 (1.878923%) markers excluded as having low (<5%) minor allele frequency
## 3 (0.0004819811%) markers excluded because of low (<95%) call rate
## 0 (0%) markers excluded because they are out of HWE (FDR <0.2)
## 0 (0%) people excluded because of low (<99%) call rate
## Mean autosomal HET is 0.3499146 (s.e. 0.006736866)
## 0 people excluded because too high autosomal heterozygosity (FDR <1%)
## Mean IBS is 0.7118663 (s.e. 0.008025326), as based on 2000 autosomal markers
## 0 (0%) people excluded because of too high IBS (>=0.95)
## In total, 610733 (98.12059%) markers passed all criteria
## In total, 93 (100%) people passed all criteria
## 
## RUN 3 
## 610733 markers and 93 people in total
## 0 (0%) markers excluded as having low (<5%) minor allele frequency
## 0 (0%) markers excluded because of low (<95%) call rate
## 0 (0%) markers excluded because they are out of HWE (FDR <0.2)
## 0 (0%) people excluded because of low (<99%) call rate
## Mean autosomal HET is 0.3499146 (s.e. 0.006736866)
## 0 people excluded because too high autosomal heterozygosity (FDR <1%)
## Mean IBS is 0.7163355 (s.e. 0.008108263), as based on 2000 autosomal markers
## 0 (0%) people excluded because of too high IBS (>=0.95)
## In total, 610733 (100%) markers passed all criteria
## In total, 93 (100%) people passed all criteria

class(qc1)

## [1] "check.marker"

str(qc1)

## List of 7
##  $ nofreq  : chr [1:117473] "rs2340587" "rs3748592" "rs2272756" "rs3748594" ...
##  $ nocall  : chr [1:937] "rs4648499" "rs4648360" "rs6424089" "rs4495664" ...
##  $ nohwe   : chr(0) 
##  $ idnocall: chr [1:107] "id2_0" "id2_1" "id2_3" "id2_4" ...
##  $ snpok   : chr [1:610733] "rs3131969" "rs3131967" "rs1048488" "rs12562034" ...
##  $ idok    : chr [1:93] "id2_2" "id2_5" "id2_6" "id2_12" ...
##  $ call    :List of 5
##   ..$ name      : chr [1:729048] "rs3131969" "rs3131967" "rs1048488" "rs12562034" ...
##   ..$ ids       : chr [1:200] "id2_0" "id2_1" "id2_2" "id2_3" ...
##   ..$ map       : Named num [1:729048] 744045 744197 750775 758311 766409 ...
##   .. ..- attr(*, "names")= chr [1:729048] "rs3131969" "rs3131967" "rs1048488" "rs12562034" ...
##   ..$ chromosome: Named chr [1:729048] "1" "1" "1" "1" ...
##   .. ..- attr(*, "names")= chr [1:729048] "rs3131969" "rs3131967" "rs1048488" "rs12562034" ...
##   ..$ call      : language check.marker(data = genodata, callrate = 0.95, perid.call = 0.99,      maf = 0.05, hweidsubset = genodata@phdata$pheno == 0)
##  - attr(*, "class")= chr "check.marker"

summary(qc1)

## $`Per-SNP fails statistics`
##           NoCall  NoMAF NoHWE Redundant Xsnpfail
## NoCall       842     95     0         0        0
## NoMAF         NA 117378     0         0        0
## NoHWE         NA     NA     0         0        0
## Redundant     NA     NA    NA         0        0
## Xsnpfail      NA     NA    NA        NA        0
## 
## $`Per-person fails statistics`
##             IDnoCall HetFail IBSFail isfemale ismale isXXY otherSexErr
## IDnoCall         107       0       0        0      0     0           0
## HetFail           NA       0       0        0      0     0           0
## IBSFail           NA      NA       0        0      0     0           0
## isfemale          NA      NA      NA        0      0     0           0
## ismale            NA      NA      NA       NA      0     0           0
## isXXY             NA      NA      NA       NA     NA     0           0
## otherSexErr       NA      NA      NA       NA     NA    NA           0

# Filter for the passed data only
data.clean <- genodata[qc1$idok, qc1$snpok]

# 107 persons had many snps that were uncalled
nids(genodata)

## [1] 200

nids(data.clean)

## [1] 93

# Change perid.call to 95 %
qc2 <- check.marker(genodata,
                    callrate = 0.95,
                    perid.call = 0.95,
                    maf = 0.05,
                    hweidsubset = genodata@phdata$pheno == 0)

## Excluding people/markers with extremely low call rate...
## 729048 markers and 200 people in total
## 0 people excluded because of call rate < 0.1 
## 0 markers excluded because of call rate < 0.1 
## Passed: 729048 markers and 200 people
## 
## RUN 1 
## 729048 markers and 200 people in total
## 105778 (14.50906%) markers excluded as having low (<5%) minor allele frequency
## 934 (0.1281123%) markers excluded because of low (<95%) call rate
## 0 (0%) markers excluded because they are out of HWE (FDR <0.2)
## 3 (1.5%) people excluded because of low (<95%) call rate
## Mean autosomal HET is 0.343121 (s.e. 0.006399162)
## 0 people excluded because too high autosomal heterozygosity (FDR <1%)
## Mean IBS is 0.7150325 (s.e. 0.007782171), as based on 2000 autosomal markers
## 0 (0%) people excluded because of too high IBS (>=0.95)
## In total, 622431 (85.37586%) markers passed all criteria
## In total, 197 (98.5%) people passed all criteria
## 
## RUN 2 
## 622431 markers and 197 people in total
## 1231 (0.1977729%) markers excluded as having low (<5%) minor allele frequency
## 84 (0.01349547%) markers excluded because of low (<95%) call rate
## 0 (0%) markers excluded because they are out of HWE (FDR <0.2)
## 0 (0%) people excluded because of low (<95%) call rate
## Mean autosomal HET is 0.3435735 (s.e. 0.006437736)
## 0 people excluded because too high autosomal heterozygosity (FDR <1%)
## Mean IBS is 0.7143686 (s.e. 0.007953909), as based on 2000 autosomal markers
## 0 (0%) people excluded because of too high IBS (>=0.95)
## In total, 621116 (99.78873%) markers passed all criteria
## In total, 197 (100%) people passed all criteria
## 
## RUN 3 
## 621116 markers and 197 people in total
## 0 (0%) markers excluded as having low (<5%) minor allele frequency
## 0 (0%) markers excluded because of low (<95%) call rate
## 0 (0%) markers excluded because they are out of HWE (FDR <0.2)
## 0 (0%) people excluded because of low (<95%) call rate
## Mean autosomal HET is 0.3435735 (s.e. 0.006437736)
## 0 people excluded because too high autosomal heterozygosity (FDR <1%)
## Mean IBS is 0.7118725 (s.e. 0.008213621), as based on 2000 autosomal markers
## 0 (0%) people excluded because of too high IBS (>=0.95)
## In total, 621116 (100%) markers passed all criteria
## In total, 197 (100%) people passed all criteria

summary(qc2)

## $`Per-SNP fails statistics`
##           NoCall  NoMAF NoHWE Redundant Xsnpfail
## NoCall       923     95     0         0        0
## NoMAF         NA 106914     0         0        0
## NoHWE         NA     NA     0         0        0
## Redundant     NA     NA    NA         0        0
## Xsnpfail      NA     NA    NA        NA        0
## 
## $`Per-person fails statistics`
##             IDnoCall HetFail IBSFail isfemale ismale isXXY otherSexErr
## IDnoCall           3       0       0        0      0     0           0
## HetFail           NA       0       0        0      0     0           0
## IBSFail           NA      NA       0        0      0     0           0
## isfemale          NA      NA      NA        0      0     0           0
## ismale            NA      NA      NA       NA      0     0           0
## isXXY             NA      NA      NA       NA     NA     0           0
## otherSexErr       NA      NA      NA       NA     NA    NA           0

data.clean2 <- genodata[qc2$idok, qc2$snpok]

# Only 3 individuals lost
nids(genodata)

## [1] 200

nids(data.clean2)

## [1] 197

0.2.3 Plotting

# Identify autosomal markers
autosomalMarkers <- autosomal(data.clean2)

gkin <- ibs(data.clean2[,autosomalMarkers], weight = "freq")

str(gkin)

##  num [1:197, 1:197] 0.50649 0.00246 -0.00155 -0.003 -0.00152 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:197] "id2_0" "id2_1" "id2_2" "id2_3" ...
##   ..$ : chr [1:197] "id2_0" "id2_1" "id2_2" "id2_3" ...
##  - attr(*, "Var")= num [1:197] 0.503 0.521 0.51 0.497 0.504 ...

# Genomic distance matrix
gdm <- as.dist(0.5 - gkin)
mds <- cmdscale(gdm)

plot(mds)

# Fit model

an <- qtscore(formula = pheno ~ sex, data.clean2, trait = "binomial")
sum(is.na(data.clean2@phdata$pheno))

## [1] 0

plot(an)

summary(an)

## Summary for top 10 results, sorted by P1df

estlambda(an[,"P1df"], plot = T)

## $estimate
## [1] 1.014491
## 
## $se
## [1] 2.867801e-05

invisible(capture.output(h2h <- polygenic(pheno~sex, kinship.matrix = gkin, data = data.clean2, llfun="polylik")))

## Warning in sqrt(eigValInvSig): NaNs produced

## Warning in log(es): NaNs produced

mm <- mmscore(h2object = h2h, data = data.clean2, clambda = T)

plot(mm)

RaukR 2018 Notes - Day 5: Bioconductor and GWAS

Håkon Kaspersen

15 June 2018

0.1 Bioconductor

0.1.1 Use of Biostrings package

0.1.2 GenomicRanges

0.1.3 Airway dataset (RNASeq)

0.1.4 Tasks

0.2 Genome-Wide Association Studies

0.2.1 Look at data

0.2.2 Quality control

0.2.3 Plotting