diffHicUsersGuide/diffHicUG.tex at master · SmythLab/diffHicUsersGuide · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
\documentclass{report}\usepackage[]{graphicx}\usepackage[usenames,dvipsnames]{color}
%% maxwidth is the original width if it is less than linewidth
%% otherwise use linewidth (to make sure the graphics do not exceed the margin)
\makeatletter
\def\maxwidth{ %
  \ifdim\Gin@nat@width>\linewidth
    \linewidth
  \else
    \Gin@nat@width
  \fi
}
\makeatother

\definecolor{fgcolor}{rgb}{0.251, 0.251, 0.251}
\newcommand{\hlnum}[1]{\textcolor[rgb]{0.816,0.125,0.439}{#1}}%
\newcommand{\hlstr}[1]{\textcolor[rgb]{0.251,0.627,0.251}{#1}}%
\newcommand{\hlcom}[1]{\textcolor[rgb]{0.502,0.502,0.502}{\textit{#1}}}%
\newcommand{\hlopt}[1]{\textcolor[rgb]{0,0,0}{#1}}%
\newcommand{\hlstd}[1]{\textcolor[rgb]{0.251,0.251,0.251}{#1}}%
\newcommand{\hlkwa}[1]{\textcolor[rgb]{0.125,0.125,0.941}{#1}}%
\newcommand{\hlkwb}[1]{\textcolor[rgb]{0,0,0}{#1}}%
\newcommand{\hlkwc}[1]{\textcolor[rgb]{0.251,0.251,0.251}{#1}}%
\newcommand{\hlkwd}[1]{\textcolor[rgb]{0.878,0.439,0.125}{#1}}%
\let\hlipl\hlkwb

\newenvironment{knitrout}{}{} % an empty environment to be redefined in TeX
\usepackage{alltt}

\RequirePackage[]{Bioconductor2}
\AtBeginDocument{\bibliographystyle{unsrturl}}


\bioctitle[\Biocpkg{diffHic} User's Guide]{\Biocpkg{diffHic}: Differential analysis of Hi-C data \\ User's Guide}
%% also: \bioctitle{Title used for both header and title page}
%% or... \title{Title used for both header and title page}
\author[1]{Aaron T. L. Lun}
\affil[1]{The Walter and Eliza Hall Institute of Medical Research, Melbourne, Australia}

\usepackage{framed,color}
\newenvironment{combox}
{ \definecolor{shadecolor}{RGB}{255, 240, 240} \begin{shaded}\begin{center}\begin{minipage}[t]{0.95\textwidth} }
{ \end{minipage}\end{center}\end{shaded} \definecolor{shadecolor}{RGB}{240,240,240} }

\usepackage{bibentry}
\nobibliography*
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\begin{document}
\maketitle

\begin{abstract}
This document contains instructions on how to use \Biocpkg{diffHic} for differential interaction analyses of Hi-C data.
It covers counting of read pairs into bin pairs, filtering out low-abundance bin pairs, normalization of sample-specific trended biases,
variance modelling and hypothesis testing, and summarization and visualization of bin pairs.

\vspace{0.05in}
\textit{First edition:} 12 December 2012 \\
\textit{Last revised:} 10 October 2017 \\
\textit{Last compiled:} 12 October 2017
\end{abstract}

\packageVersion{diffHic 1.9.8}

\tableofcontents
\newpage


\chapter{Introduction}
\section{Scope}
This document describes the differential analysis of Hi-C data with the \Biocpkg{diffHic} package.
Differential interactions (DIs) are defined as interactions with significant changes in intensity between experimental conditions.
DIs are identified in a statistically rigorous manner using methods in the \Biocpkg{edgeR} package \cite{edgeR}.
Knowledge of \Biocpkg{edgeR} is useful but is not necessary for this guide.

\section{How to get help}
Most questions about individual functions should be answered by the documentation.
For example, if you want to know more about \Rfunction{preparePairs}, you can bring up the documentation by typing \Rcode{?preparePairs} or \Rcode{help(preparePairs)} at the \R{} prompt.
Further detail on the methods or the theory can be found in the references at the bottom of each help page.
The entire \Biocpkg{diffHic} pipeline is also described in its own publication \cite{lun2015diffhic}.

The authors of the package always appreciate receiving reports of bugs in the package functions or in the documentation.
The same goes for well-considered suggestions for improvements.
Other questions about how to use \Biocpkg{diffHic} are best sent to the Bioconductor support site at \url{https://support.bioconductor.org}.
Please send requests for general assistance and advice to the support site, rather than to the individual authors.
Users posting to the support site for the first time may find it helpful to read the posting guide at \url{http://www.bioconductor.org/help/support/posting-guide}.

\section{A brief description of Hi-C}
The Hi-C protocol \cite{lieberman2009comprehensive} is used to study chromatin organization by identifying pairwise interactions between two distinct genomic loci.
Briefly, chromatin is cross-linked and digested with a restriction enzyme.
This releases chromatin complexes into solution, where each complex contains multiple restriction fragments corresponding to interacting loci.
Overhangs are filled in with biotin-labelled nucleotides to form blunt ends.
Proximity ligation is then performed, which favors ligation between blunt ends in the same complex.
The ligated DNA is sonicated and the sheared fragments containing ligation junctions are purified by a streptavidin pulldown.
Paired-end sequencing is performed on the purified ligation products, and pairs of interacting loci are identified by mapping the paired reads.
Of course, some caution is required due to the presence of non-specific ligation between blunt ends in different complexes.

\section{Quick start}
A typical differential analysis of Hi-C data is described below.
For simplicity, assume that the BAM files have already been processed into ``index'' files in \Robject{input}.
Let \Robject{design} contain the design matrix for this experiment.
Also assume that the boundaries of the relevant restriction fragments are present in \Robject{fragments}.
The code itself is split across several steps:


% saveRDS(fragments, file="mm10-hindIII.rds")

\begin{enumerate}
\item Converting BAM files to index files.
\item Counting read pairs into pairs of genomic bins.
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{library}\hlstd{(diffHic)}
\hlstd{param} \hlkwb{<-} \hlkwd{pairParam}\hlstd{(}\hlkwc{fragments}\hlstd{=fragments)}
\hlstd{data} \hlkwb{<-} \hlkwd{squareCounts}\hlstd{(input,} \hlkwc{width}\hlstd{=}\hlnum{1e6}\hlstd{,} \hlkwc{param}\hlstd{=param)}
\end{alltt}
\end{kframe}
\end{knitrout}
\item Filtering out uninteresting bin pairs.
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{library}\hlstd{(edgeR)}
\hlstd{keep} \hlkwb{<-} \hlkwd{aveLogCPM}\hlstd{(}\hlkwd{asDGEList}\hlstd{(data))} \hlopt{>} \hlnum{0}
\hlstd{data} \hlkwb{<-} \hlstd{data[keep,]}
\end{alltt}
\end{kframe}
\end{knitrout}
\item Normalizing for sample-specific biases.
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{data} \hlkwb{<-} \hlkwd{normOffsets}\hlstd{(data,} \hlkwc{type}\hlstd{=}\hlstr{"loess"}\hlstd{,} \hlkwc{se.out}\hlstd{=}\hlnum{TRUE}\hlstd{)}
\hlstd{y} \hlkwb{<-} \hlkwd{asDGEList}\hlstd{(data)}
\end{alltt}
\end{kframe}
\end{knitrout}
\item Modelling biological variability between replicates.
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{y} \hlkwb{<-} \hlkwd{estimateDisp}\hlstd{(y, design)}
\hlstd{fit} \hlkwb{<-} \hlkwd{glmQLFit}\hlstd{(y, design,} \hlkwc{robust}\hlstd{=}\hlnum{TRUE}\hlstd{)}
\end{alltt}
\end{kframe}
\end{knitrout}
\item Testing for significant differences between groups.
\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{result} \hlkwb{<-} \hlkwd{glmQLFTest}\hlstd{(fit)}
\hlstd{clusters} \hlkwb{<-} \hlkwd{diClusters}\hlstd{(data, result}\hlopt{$}\hlstd{table,} \hlkwc{target}\hlstd{=}\hlnum{0.05}\hlstd{,}
                       \hlkwc{cluster.args}\hlstd{=}\hlkwd{list}\hlstd{(}\hlkwc{tol}\hlstd{=}\hlnum{1}\hlstd{))}
\end{alltt}
\end{kframe}
\end{knitrout}
\end{enumerate}
In the various examples for this guide, data will be used from three studies.
The first dataset examines the chromatin structure in K562 and GM06990 cell lines \cite{lieberman2009comprehensive}.
The second compares interaction intensities between wild-type and cohesin-deficient murine neural stem cells \cite{sofueva2013cohesin}.
The final study compares ERG-overexpressing RWPE1 cells with a GFP-expressing control \cite{rickman2012oncogene}.
Obviously, readers will have to modify the code for their own analyses.

\chapter{Preparing index files from BAM files}
\label{chap:prep}

\begin{combox}
This box will appear at the start of each chapter, describing the objects required from previous chapters.
As we're starting out here, we don't really need anything.
\end{combox}

\section{A comment on aligning Hi-C libraries}
\label{sec:align}

In a typical Hi-C library, sequencing will occasionally be performed over the ligation junction between two restriction fragments.
This forms a chimeric read that contains sequences from two distinct genomic loci.
Here, correct alignment of the 5$'$ end of the chimeric read is more important.
This is because the location of the 3$'$ end is already provided by mapping the 5$'$ end of the mate read.
Direct application of alignment software will not be optimal as only one mapping location will be usually reported for each read.
This means that the 5$'$ location will be ignored if the 3$'$ alignment is superior, e.g., longer or fewer mismatches.

Instead, chimeric alignment can be performed with approaches like iterative mapping \cite{imakaev2012iterative} or read splitting \cite{seitan2013cohesin}.
In the latter, we define the ligation signature as the sequence that is obtained after ligation between blunt ends derived from cleaved restriction sites.
For example, the \textit{Hin}dIII enzyme cleaves at \Rcode{AAGCTT} with a 4 bp overhang.
This yields a signature sequence of \Rcode{AAGCTAGCTT} upon ligation of blunt ends.
The ligation signature in each chimeric read is identified with \software{cutadapt} \cite{martin2011cutadapt}, and the read is split into two segments at the center of the signature.
Each segment is then aligned separately to the reference genome using \software{Bowtie2} \cite{langmead2012bowtie}.
Mapping by read splitting is performed in \Biocpkg{diffHic} using a custom Python script:

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{system.file}\hlstd{(}\hlstr{"python"}\hlstd{,} \hlstr{"presplit_map.py"}\hlstd{,} \hlkwc{package}\hlstd{=}\hlstr{"diffHic"}\hlstd{,} \hlkwc{mustWork}\hlstd{=}\hlnum{TRUE}\hlstd{)}
\end{alltt}
\end{kframe}
\end{knitrout}

Users are strongly recommended to synchronize mate pair information and to mark duplicate read pairs in the resulting BAM file.
This can be achieved using the various tools in the \software{Picard} software suite (\url{http://broadinstitute.github.io/picard}).

\section{Matching mapped reads to restriction fragments}
The Hi-C protocol is based on ligation between restriction fragments.
Sequencing of the ligation product is performed to identify the interacting loci --  or, more precisely, the two restriction fragments containing the interacting loci.
The resolution of Hi-C data is inherently limited by the frequency of restriction sites and the size of the restriction fragments.
Thus, it makes sense to report the read alignment location in terms of the restriction fragment to which that read was mapped.
The boundaries of each restriction fragment can be obtained with the \Rfunction{cutGenome} function, as shown below for the human genome after digestion with the \textit{Hin}dIII restriction enzyme (recognition site of \Rcode{AAGCTT}, 5$'$ overhang of 4 bp).

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{library}\hlstd{(BSgenome.Hsapiens.UCSC.hg19)}
\hlstd{hs.frag} \hlkwb{<-} \hlkwd{cutGenome}\hlstd{(BSgenome.Hsapiens.UCSC.hg19,} \hlstr{"AAGCTT"}\hlstd{,} \hlnum{4}\hlstd{)}
\hlstd{hs.frag}
\end{alltt}
\begin{verbatim}
## GRanges object with 846225 ranges and 0 metadata columns:
##                  seqnames         ranges strand
##                     <Rle>      <IRanges>  <Rle>
##        [1]           chr1 [    1, 16011]      *
##        [2]           chr1 [16008, 24575]      *
##        [3]           chr1 [24572, 27985]      *
##        [4]           chr1 [27982, 30433]      *
##        [5]           chr1 [30430, 32157]      *
##        ...            ...            ...    ...
##   [846221] chrUn_gl000249 [22854, 25904]      *
##   [846222] chrUn_gl000249 [25901, 31201]      *
##   [846223] chrUn_gl000249 [31198, 36757]      *
##   [846224] chrUn_gl000249 [36754, 36891]      *
##   [846225] chrUn_gl000249 [36888, 38502]      *
##   -------
##   seqinfo: 93 sequences from hg19 genome
\end{verbatim}
\end{kframe}
\end{knitrout}

These fragments should be stored in a \Rclass{pairParam} object.
The constructor below checks that the fragment ranges are properly ordered.
Later, this object will also hold other parameters for counting.
This simplifies coordination of the various steps in the \Biocpkg{diffHic} pipeline, as the same \Rclass{pairParam} object can be easily passed between different functions.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{hs.param} \hlkwb{<-} \hlkwd{pairParam}\hlstd{(hs.frag)}
\hlstd{hs.param}
\end{alltt}
\begin{verbatim}
## Genome contains 846225 restriction fragments across 93 chromosomes
## No discard regions are specified
## No limits on chromosomes for read extraction
## No cap on the read pairs per pair of restriction fragments
\end{verbatim}
\end{kframe}
\end{knitrout}

The \Rfunction{preparePairs} function matches the mapping location of each read to a restriction fragment in the reference genome.
Mapping results for paired reads should be provided in a name-sorted BAM file.
The function converts the read position into an index, pointing to the matching restriction fragment in \Robject{hs.frag}.
The resulting pairs of indices are stored in an index file using the HDF5 format.
The fragments to which the reads mapped are referred to as ``anchors'' -- the larger index is defined as the first anchor fragment whereas the smaller is the second.
This process is demonstrated using Hi-C data generated from GM06990 cells.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{diagnostics} \hlkwb{<-} \hlkwd{preparePairs}\hlstd{(}\hlstr{"SRR027957.bam"}\hlstd{, hs.param,} \hlkwc{file}\hlstd{=}\hlstr{"SRR027957.h5"}\hlstd{,}
                            \hlkwc{dedup}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{minq}\hlstd{=}\hlnum{10}\hlstd{)}
\hlkwd{names}\hlstd{(diagnostics)}
\end{alltt}
\begin{verbatim}
## [1] "pairs"    "same.id"  "singles"  "chimeras"
\end{verbatim}
\end{kframe}
\end{knitrout}

The function itself returns a list of diagnostics showing the number of read pairs that are lost for various reasons.
Of particular note is the removal of reads that are potential PCR duplicates with \Rcode{dedup=TRUE}.
This requires marking of the reads beforehand using an appropriate program such as \software{MarkDuplicates} from the \software{Picard} suite.
Filtering on the minimum mapping quality score with \Rcode{minq} is also recommended to remove spurious alignments.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{diagnostics}\hlopt{$}\hlstd{pairs}
\end{alltt}
\begin{verbatim}
##    total   marked filtered   mapped
##  7068675   103594  1532760  5460120
\end{verbatim}
\end{kframe}
\end{knitrout}

Read pairs mapping to the same restriction fragment provide little information on interactions between fragments \cite{belton2012hic}.
Dangling ends are inward-facing read pairs that are mapped to the same fragment.
These are uninformative as they are usually formed from sequencing of the restriction fragment prior to ligation.
Self-circles are outward-facing read pairs that are formed when two ends of the same restriction fragment ligate to one another.
Interactions within a fragment cannot be easily distinguished from these self-circularization events.
Both structures are removed to avoid confusion in downstream steps.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{diagnostics}\hlopt{$}\hlstd{same.id}
\end{alltt}
\begin{verbatim}
##    dangling self.circle
##      425219      138248
\end{verbatim}
\end{kframe}
\end{knitrout}

\section{Processing of chimeric reads}
In the BAM file, chimeric reads are represented by two separate alignments for each read.
Hard clipping is used to denote the length trimmed from each sequence in each alignment, and to determine which alignment corresponds to the 5$'$ or 3$'$ end of the read.
Only the 5$'$ end(s) is used to determine the restriction fragment index for that read pair.
The total number of chimeric read pairs will be reported by \Rfunction{preparePairs}, along with the number of pairs where both 5$'$ ends are mapped (\Rcode{mapped}) and the nuber where both 5$'$ ends and at least one 3$'$ end are mapped (\Rcode{multi}).
Of course, the function will also work if the mapping location is only given for the 5$'$ end, though chimeric statistics will not be properly computed.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{diagnostics}\hlopt{$}\hlstd{chimeras}
\end{alltt}
\begin{verbatim}
##   total  mapped   multi invalid
## 2495159 1725843 1040864   67989
\end{verbatim}
\end{kframe}
\end{knitrout}

The proportion of invalid chimeric pairs is also calculated by \Rfunction{preparePairs}.
Invalid pairs are those where the 3$'$ location of a chimeric read disagrees with the 5$'$ location of the mate.
The invalid proportion can be used as an empirical measure of the mapping error rate - or, at least, the upper bound thereof, given that error rates are likely to be lower for longer, non-chimeric alignments.
High error rates may be indicative of a fault in the mapping pipeline.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{diagnostics}\hlopt{$}\hlstd{chimeras[[}\hlstr{"invalid"}\hlstd{]]}\hlopt{/}\hlstd{diagnostics}\hlopt{$}\hlstd{chimeras[[}\hlstr{"multi"}\hlstd{]]}
\end{alltt}
\begin{verbatim}
## [1] 0.06531977
\end{verbatim}
\end{kframe}
\end{knitrout}

Invalid chimeric pairs can be discarded by setting \Rcode{ichim=FALSE} in \Rfunction{preparePairs}.
However, this is not recommended for routine analyses.
Mapping errors for short 3$'$ ends may result in apparent invalidity and loss of the (otherwise correct) 5$'$ alignments.

\section{Filtering artifactual read pairs}

\subsection{Reprocessing index files for quality control}
The \Rfunction{prunePairs} function removes read pairs that correspond to artifacts in the Hi-C procedure.
The returned vector contains the number of read pairs removed for each type of artifact.
Values of \Rcode{length}, \Rcode{inward} and \Rcode{outward} correspond to removal by \Rcode{max.frag}, \Rcode{min.inward} and \Rcode{min.outward}, respectively.
Retained read pairs are stored in another index file for later use.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{min.inward} \hlkwb{<-} \hlnum{1000}
\hlstd{min.outward} \hlkwb{<-} \hlnum{25000}
\hlkwd{prunePairs}\hlstd{(}\hlstr{"SRR027957.h5"}\hlstd{, hs.param,} \hlkwc{file.out}\hlstd{=}\hlstr{"SRR027957_trimmed.h5"}\hlstd{,}
           \hlkwc{max.frag}\hlstd{=}\hlnum{600}\hlstd{,} \hlkwc{min.inward}\hlstd{=min.inward,} \hlkwc{min.outward}\hlstd{=min.outward)}
\end{alltt}
\begin{verbatim}
##    total   length   inward  outward retained
##  4896653   870339    94644    82964  3860024
\end{verbatim}
\end{kframe}
\end{knitrout}

The \Rcode{max.frag} argument removes read pairs where the inferred length of the sequencing fragment (i.e., the ligation product) is greater than a specified value.
The length of the sequencing fragment is computed by summing, for both reads, the distance between the mapping location of the 5$'$ end and the nearest restriction site on the 3$'$ side.
Excessively large lengths are indicative of off-site cleavage, i.e., where the restriction enzyme or some other agent cuts the DNA at a location other than the restriction site.
While not completely uninformative, these are discarded as they are not expected from the Hi-C protocol.
The threshold value can be chosen based on the size selection interval in library preparation, or by examining the distribution of inferred fragment lengths from \Rfunction{getPairData}.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{diags} \hlkwb{<-} \hlkwd{getPairData}\hlstd{(}\hlstr{"SRR027957.h5"}\hlstd{, hs.param)}
\hlkwd{hist}\hlstd{(diags}\hlopt{$}\hlstd{length[diags}\hlopt{$}\hlstd{length} \hlopt{<} \hlnum{1000}\hlstd{],} \hlkwc{ylab}\hlstd{=}\hlstr{"Frequency"}\hlstd{,}
     \hlkwc{xlab}\hlstd{=}\hlstr{"Spacing (bp)"}\hlstd{,} \hlkwc{main}\hlstd{=}\hlstr{""}\hlstd{,} \hlkwc{col}\hlstd{=}\hlstr{"grey80"}\hlstd{)}
\end{alltt}
\end{kframe}\begin{adjustwidth}{2\fltoffset}{2\fltoffset}
\includegraphics[width=\maxwidth]{plots-ug/fraglenhist-1} \end{adjustwidth}
\end{knitrout}

The insert size is defined as the linear distance between two paired reads on the same chromosome.
The \Rcode{min.inward} paramater removes inward-facing intra-chromosomal read pairs where the insert size is less than the specified value.
The \Rcode{min.outward} parameter does the same for outward-facing intra-chromosomal read pairs.
This is designed to remove dangling ends or self-circles involving DNA fragments that have not been completely digested \cite{jin2013highres}.
Such read pairs are technical artifacts that are (incorrectly) retained by \Rfunction{preparePairs}, as the two reads involved are mapped to different restriction fragments.
Appropriate thresholds for both parameters can be determined using strand orientation plots.

\subsection{Setting size thresholds with strand orientation plots}
\label{sec:strorient}
The strand orientation for a read pair refers to the combination of strands for the two alignments.
These are stored as flags where setting \Rcode{0x1} or \Rcode{0x2} means that the read on the first or second anchor fragment, respectively, is mapped on the reverse strand.
If different pieces of DNA were randomly ligated together, one would expect to observe equal proportions of all strand orientations.
This can be tested by examining the distribution of strand orientations for inter-chromosomal read pairs.
Each orientation is equally represented across these read pairs, which is expected as different chromosomes are distinct pieces of DNA.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{intra} \hlkwb{<-} \hlopt{!}\hlkwd{is.na}\hlstd{(diags}\hlopt{$}\hlstd{insert)}
\hlkwd{table}\hlstd{(diags}\hlopt{$}\hlstd{orientation[}\hlopt{!}\hlstd{intra])}
\end{alltt}
\begin{verbatim}
##
##      0      1      2      3
## 768247 764801 764579 764536
\end{verbatim}
\end{kframe}
\end{knitrout}

This can be repeated for intra-chromosomal read pairs, by plotting the distribution of insert sizes for each strand orientation \cite{jin2013highres}.
The two same-strand distributions are averaged for convenience.
At high insert sizes, the distributions will converge for all strand orientations.
This is consistent with random ligation between two separate restriction fragments.
At lower insert sizes, spikes are observed in the ouward- and inward-facing distributions due to self-circularization and dangling ends, respectively.
Thresholds should be chosen in \Rfunction{prunePairs} to remove these spikes, as represented by the grey lines.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlcom{# Constructing the histograms for each orientation.}
\hlstd{llinsert} \hlkwb{<-} \hlkwd{log2}\hlstd{(diags}\hlopt{$}\hlstd{insert} \hlopt{+} \hlnum{1L}\hlstd{)}
\hlstd{intra} \hlkwb{<-} \hlopt{!}\hlkwd{is.na}\hlstd{(llinsert)}
\hlstd{breaks} \hlkwb{<-} \hlkwd{seq}\hlstd{(}\hlkwd{min}\hlstd{(llinsert[intra]),} \hlkwd{max}\hlstd{(llinsert[intra]),} \hlkwc{length.out}\hlstd{=}\hlnum{30}\hlstd{)}
\hlstd{inward} \hlkwb{<-} \hlkwd{hist}\hlstd{(llinsert[diags}\hlopt{$}\hlstd{orientation}\hlopt{==}\hlnum{1L}\hlstd{],} \hlkwc{plot}\hlstd{=}\hlnum{FALSE}\hlstd{,} \hlkwc{breaks}\hlstd{=breaks)}
\hlstd{outward} \hlkwb{<-} \hlkwd{hist}\hlstd{(llinsert[diags}\hlopt{$}\hlstd{orientation}\hlopt{==}\hlnum{2L}\hlstd{] ,}\hlkwc{plot}\hlstd{=}\hlnum{FALSE}\hlstd{,} \hlkwc{breaks}\hlstd{=breaks)}
\hlstd{samestr} \hlkwb{<-} \hlkwd{hist}\hlstd{(llinsert[diags}\hlopt{$}\hlstd{orientation}\hlopt{==}\hlnum{0L} \hlopt{|} \hlstd{diags}\hlopt{$}\hlstd{orientation}\hlopt{==}\hlnum{3L}\hlstd{],}
                \hlkwc{plot}\hlstd{=}\hlnum{FALSE}\hlstd{,} \hlkwc{breaks}\hlstd{=breaks)}
\hlstd{samestr}\hlopt{$}\hlstd{counts} \hlkwb{<-} \hlstd{samestr}\hlopt{$}\hlstd{counts}\hlopt{/}\hlnum{2}

\hlcom{# Setting up the axis limits.}
\hlstd{ymax} \hlkwb{<-} \hlkwd{max}\hlstd{(inward}\hlopt{$}\hlstd{counts, outward}\hlopt{$}\hlstd{counts, samestr}\hlopt{$}\hlstd{counts)}\hlopt{/}\hlnum{1e6}
\hlstd{xmax} \hlkwb{<-} \hlkwd{max}\hlstd{(inward}\hlopt{$}\hlstd{mids, outward}\hlopt{$}\hlstd{mids, samestr}\hlopt{$}\hlstd{mids)}
\hlstd{xmin} \hlkwb{<-} \hlkwd{min}\hlstd{(inward}\hlopt{$}\hlstd{mids, outward}\hlopt{$}\hlstd{mids, samestr}\hlopt{$}\hlstd{mids)}

\hlcom{# Making a plot with all orientations overlaid.}
\hlkwd{plot}\hlstd{(}\hlnum{0}\hlstd{,}\hlnum{0}\hlstd{,}\hlkwc{type}\hlstd{=}\hlstr{"n"}\hlstd{,} \hlkwc{xlim}\hlstd{=}\hlkwd{c}\hlstd{(xmin, xmax),} \hlkwc{ylim}\hlstd{=}\hlkwd{c}\hlstd{(}\hlnum{0}\hlstd{, ymax),}
     \hlkwc{xlab}\hlstd{=}\hlkwd{expression}\hlstd{(log[}\hlnum{2}\hlstd{]}\hlopt{~}\hlstr{"[insert size (bp)]"}\hlstd{),} \hlkwc{ylab}\hlstd{=}\hlstr{"Frequency (millions)"}\hlstd{)}
\hlkwd{lines}\hlstd{(inward}\hlopt{$}\hlstd{mids, inward}\hlopt{$}\hlstd{counts}\hlopt{/}\hlnum{1e6}\hlstd{,} \hlkwc{col}\hlstd{=}\hlstr{"darkgreen"}\hlstd{,} \hlkwc{lwd}\hlstd{=}\hlnum{2}\hlstd{)}
\hlkwd{abline}\hlstd{(}\hlkwc{v}\hlstd{=}\hlkwd{log2}\hlstd{(min.inward),} \hlkwc{col}\hlstd{=}\hlstr{"darkgrey"}\hlstd{)}
\hlkwd{lines}\hlstd{(outward}\hlopt{$}\hlstd{mids, outward}\hlopt{$}\hlstd{counts}\hlopt{/}\hlnum{1e6}\hlstd{,} \hlkwc{col}\hlstd{=}\hlstr{"red"}\hlstd{,} \hlkwc{lwd}\hlstd{=}\hlnum{2}\hlstd{)}
\hlkwd{abline}\hlstd{(}\hlkwc{v}\hlstd{=}\hlkwd{log2}\hlstd{(min.outward),} \hlkwc{col}\hlstd{=}\hlstr{"darkgrey"}\hlstd{,} \hlkwc{lty}\hlstd{=}\hlnum{2}\hlstd{)}
\hlkwd{lines}\hlstd{(samestr}\hlopt{$}\hlstd{mids, samestr}\hlopt{$}\hlstd{counts}\hlopt{/}\hlnum{1e6}\hlstd{,} \hlkwc{col}\hlstd{=}\hlstr{"blue"}\hlstd{,} \hlkwc{lwd}\hlstd{=}\hlnum{2}\hlstd{)}
\hlkwd{legend}\hlstd{(}\hlstr{"topright"}\hlstd{,} \hlkwd{c}\hlstd{(}\hlstr{"inward"}\hlstd{,} \hlstr{"outward"}\hlstd{,} \hlstr{"same"}\hlstd{),}
       \hlkwc{col}\hlstd{=}\hlkwd{c}\hlstd{(}\hlstr{"darkgreen"}\hlstd{,} \hlstr{"red"}\hlstd{,} \hlstr{"blue"}\hlstd{),} \hlkwc{lwd}\hlstd{=}\hlnum{2}\hlstd{)}
\end{alltt}
\end{kframe}\begin{adjustwidth}{2\fltoffset}{2\fltoffset}
\includegraphics[width=\maxwidth]{plots-ug/strorient-1} \end{adjustwidth}
\end{knitrout}

As an aside, the position of the spikes in the above plots can be used to estimate some fragment lengths.
The $x$-coordinate of the outward-facing spike represents the average length of the DNA fragments after restriction digestion.
This is useful as it provides a lower bound on the spatial resolution of any given Hi-C experiment.
The position of the inward-facing spike represents the average length of the fragments after sonication.
This should be lower than the size selection thresholds used in library preparation.

\section{Merging technical replicates}
Hi-C experiments often involve deep sequencing as read pairs are sparsely distributed across the set of possible interactions.
As a result, multiple index files may be generated from multiple technical replicates of a single Hi-C library.
These can be merged together using the \Rfunction{mergePairs} function prior to downstream processing.
This is equivalent to summing the counts for each pair of restriction fragment indices, and is valid if one assumes Poisson sampling for each sequencing run \cite{marioni2008rnaseq}.
An example is provided below that merges several technical replicates for a Hi-C library generated from GM06990 cells \cite{lieberman2009comprehensive}.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{prepped} \hlkwb{<-} \hlkwd{preparePairs}\hlstd{(}\hlstr{"SRR027958.bam"}\hlstd{, hs.param,} \hlkwc{file}\hlstd{=}\hlstr{"SRR027958.h5"}\hlstd{,}
                        \hlkwc{dedup}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{minq}\hlstd{=}\hlnum{10}\hlstd{)}
\hlstd{counted} \hlkwb{<-} \hlkwd{prunePairs}\hlstd{(}\hlstr{"SRR027958.h5"}\hlstd{, hs.param,} \hlkwc{file.out}\hlstd{=}\hlstr{"SRR027958_trimmed.h5"}\hlstd{,}
                      \hlkwc{max.frag}\hlstd{=}\hlnum{600}\hlstd{,} \hlkwc{min.inward}\hlstd{=min.inward,}
                      \hlkwc{min.outward}\hlstd{=min.outward)}
\hlkwd{mergePairs}\hlstd{(}\hlkwc{files}\hlstd{=}\hlkwd{c}\hlstd{(}\hlstr{"SRR027957_trimmed.h5"}\hlstd{,} \hlstr{"SRR027958_trimmed.h5"}\hlstd{),} \hlstr{"merged.h5"}\hlstd{)}
\end{alltt}
\end{kframe}
\end{knitrout}

In addition, any Hi-C dataset that is processed manually by the user can be stored in an index file using the \Rfunction{savePairs} function.
This takes a dataframe with the first and second anchor indices, as well as any additional information that might be useful.
The idea is to provide an entry point into the \Biocpkg{diffHic} analysis from other pipelines.
If the dataset is too large, one can save chunks at a time before merging them all together with \Rfunction{mergePairs}.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{anchor1.id} \hlkwb{<-} \hlkwd{as.integer}\hlstd{(}\hlkwd{runif}\hlstd{(}\hlnum{100}\hlstd{,} \hlnum{1}\hlstd{,} \hlkwd{length}\hlstd{(hs.param}\hlopt{$}\hlstd{fragments)))}
\hlstd{anchor2.id} \hlkwb{<-} \hlkwd{as.integer}\hlstd{(}\hlkwd{runif}\hlstd{(}\hlnum{100}\hlstd{,} \hlnum{1}\hlstd{,} \hlkwd{length}\hlstd{(hs.param}\hlopt{$}\hlstd{fragments)))}
\hlstd{dummy} \hlkwb{<-} \hlkwd{data.frame}\hlstd{(anchor1.id, anchor2.id,} \hlkwc{other.data}\hlstd{=}\hlkwd{runif}\hlstd{(}\hlnum{100}\hlstd{))}
\hlkwd{savePairs}\hlstd{(dummy,} \hlstr{"example.h5"}\hlstd{, hs.param)}
\end{alltt}
\end{kframe}
\end{knitrout}

For full compatibility, users should include the alignment positions and lengths for each read pair as \Rcode{anchorX.pos} and \Rcode{anchorX.len},
    where \Rcode{X} is 1 or 2 for each of the paired reads.
The alignment position refers to the 1-based coordinate of the left-most base of the alignment.
The alignment length refers to the span of the alignment relative to the reference, and should be negative for alignments on the reverse strand.
This information will be used in downstream \Biocpkg{diffHic} functions, such as read counting around blacklisted regions.

\section{Handling DNase Hi-C experiments}
DNase Hi-C is a variant of the standard protocol whereby the DNase~I enzyme is used to fragment the genome instead of restriction enzymes \cite{ma2015dnase}.
Random fragmentation provides resolution beyond that of individual restriction fragments.
However, cleavage sites for DNase~I cannot be easily predicted to construct \Rcode{param\$fragments}.
Fragment indices have no meaning here because there are no restriction fragments for reads to be assigned to.

Instead, \Biocpkg{diffHic} handles this type of data by operating directly on the alignment position of each read.
This reflects the theoretical base-pair resolution of the data during quantification.
To indicate that the reads come from a DNase Hi-C experiment, an empty \Rclass{GRanges} object should be supplied as the \Rcode{fragments} in the \Rclass{pairParam} object.
Most \Biocpkg{diffHic} functions will detect this and behave appropriately to exploit the improved resolution.
An example of this approach is shown below, where the SRR027957 library is treated as a DNase Hi-C sample.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{seg.frags} \hlkwb{<-} \hlkwd{emptyGenome}\hlstd{(BSgenome.Hsapiens.UCSC.hg19)}
\hlkwd{preparePairs}\hlstd{(}\hlstr{"SRR027957.bam"}\hlstd{,} \hlkwd{pairParam}\hlstd{(seg.frags),} \hlkwc{file}\hlstd{=}\hlstr{"SRR027957.h5"}\hlstd{,}
             \hlkwc{dedup}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{minq}\hlstd{=}\hlnum{10}\hlstd{)}
\end{alltt}
\begin{verbatim}
## $pairs
##    total   marked filtered   mapped
##  7068675   103594  1532760  5460120
##
## $singles
## [1] 0
##
## $chimeras
##   total  mapped   multi invalid
## 2495159 1779050 1073948   41582
\end{verbatim}
\end{kframe}
\end{knitrout}

Unlike \Rfunction{preparePairs}, no diagnostic information regarding self-circles or dangling ends is reported here.
Such metrics are irrelevant when restriction fragments are not involved.
Similarly, the \Rcode{length} field in the output of \Rfunction{getPairData} is set to \Rcode{NA} and should be ignored.
This is because the length of the sequencing fragment cannot be generally computed without knowledge of the fragmentation sites.
As a result, the \Rcode{max.frag} argument should also be set to \Rcode{NA} in \Rfunction{prunePairs}.
Metrics for inward- and outward-facing read pairs are unaffected, as these are computed independently of the fragments.
See the documentation for individual functions to determine if/how their behaviour changes for DNase Hi-C data.

Users should also note that the alignment script described in Section~\ref{sec:align} is not appropriate for DNase Hi-C experiments.
This approach is based on splitting chimeric reads at the ligation signature, which is constructed from the recognition site of a restriction enzyme.
The sequence around the ligation junction is not well-defined when DNase~I is used for cleavage.
Instead, alignment programs should be used that can directly handle chimeric reads with arbitrary breakpoints in the genome, e.g., \software{BWA} \cite{li2010fast}.
A Python script for iterative mapping \cite{imakaev2012iterative} is also provided for this purpose.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{system.file}\hlstd{(}\hlstr{"python"}\hlstd{,} \hlstr{"iter_map.py"}\hlstd{,} \hlkwc{package}\hlstd{=}\hlstr{"diffHic"}\hlstd{,} \hlkwc{mustWork}\hlstd{=}\hlnum{TRUE}\hlstd{)}
\end{alltt}
\end{kframe}
\end{knitrout}


\chapter{Counting read pairs into interactions}
\label{chap:counting}

\begin{combox}
A different dataset will be used here, so we don't need anything from the last chapter.
Horses for courses; this dataset's a lot nicer for detecting differential interactions.
\end{combox}

\section{Overview}
Prior to any statistical analysis, the read pairs in a Hi-C library must be summarized into a count for each interaction.
This count is used as an experimental measure of the interaction intensity.
Specifically, each pairwise interaction is parameterized by two genomic intervals representing the interacting loci.
The count for that interaction is defined as the number of read pairs with one read mapping to each of the intervals.
Counting is performed for each sample in the dataset, such that each interaction is associated with a set of counts.

The interaction space is defined as the genome-by-genome space over which the read pairs are distributed.
Recall that each paired read is assigned to a restriction fragment index.
The interaction space contains all index pairs $(x, y)$ for $x, y \in [1 .. N]$, where $x \ge y$ and $N$ is the number of restriction fragments in the genome.
This can be visualized as the triangular space between $y=x$, $y=0$ and $x=N$ on the Cartesian plane.
A rectangular area in the interaction space represents a pairwise interaction between the genomic intervals spanned by the two adjacent sides of that rectangle.
The number of read pairs in this area is used as the count for the corresponding interaction.
Non-rectangular areas can also represent interactions, but these are more difficult to interpret and will not be considered here.

The examples shown here will use the neural stem cell dataset \cite{sofueva2013cohesin} in which wild-type cells are compared to cohesin-deficient cells.
Read processing has already been performed to construct an index file for each library.
Some additional work is required to obtain the restriction fragment coordinates for the \textit{Hin}dIII-digested mouse genome.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{library}\hlstd{(BSgenome.Mmusculus.UCSC.mm10)}
\hlstd{mm.frag} \hlkwb{<-} \hlkwd{cutGenome}\hlstd{(BSgenome.Mmusculus.UCSC.mm10,} \hlstr{"AAGCTT"}\hlstd{,} \hlnum{4}\hlstd{)}
\hlstd{input} \hlkwb{<-} \hlkwd{c}\hlstd{(}\hlstr{"merged_flox_1.h5"}\hlstd{,} \hlstr{"merged_flox_2.h5"}\hlstd{,}
           \hlstr{"merged_ko_1.h5"}\hlstd{,} \hlstr{"merged_ko_2.h5"}\hlstd{)}
\end{alltt}
\end{kframe}
\end{knitrout}

\section{Counting into bin pairs}

\subsection{Overview}
Here, the genome is partitioned into contiguous non-overlapping bins of constant size.
Each interaction is defined as a pair of these bins.
This approach avoids the need for prior knowledge of the loci of interest when summarizing Hi-C counts.
Counting of read pairs between paired bins is performed for multiple libraries using the \Rfunction{squareCounts} function.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{bin.size} \hlkwb{<-} \hlnum{1e6}
\hlstd{mm.param} \hlkwb{<-} \hlkwd{pairParam}\hlstd{(mm.frag)}
\hlstd{data} \hlkwb{<-} \hlkwd{squareCounts}\hlstd{(input, mm.param,} \hlkwc{width}\hlstd{=bin.size,} \hlkwc{filter}\hlstd{=}\hlnum{1}\hlstd{)}
\hlstd{data}
\end{alltt}
\begin{verbatim}
## class: InteractionSet
## dim: 3319100 4
## metadata(2): param width
## assays(1): counts
## rownames: NULL
## rowData names(0):
## colnames: NULL
## colData names(1): totals
## type: ReverseStrictGInteractions
## regions: 2739
\end{verbatim}
\end{kframe}
\end{knitrout}

This generates an \Rclass{InteractionSet} object containing information for multiple genomic interactions \cite{lun2016infrastructure}.
Each row of the object corresponds to an interaction, i.e., bin pair, while each column represents a library.
For each interaction, the coordinates of its two ``anchor'' bins are returned.
The first/second anchor notation is used for these bins, whereby the first anchor bin is that with the ``higher'' genomic coordinate.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{head}\hlstd{(}\hlkwd{anchors}\hlstd{(data,} \hlkwc{type}\hlstd{=}\hlstr{"first"}\hlstd{))}
\end{alltt}
\begin{verbatim}
## GRanges object with 6 ranges and 1 metadata column:
##       seqnames             ranges strand |    nfrags
##          <Rle>          <IRanges>  <Rle> | <integer>
##   [1]     chr1 [3004106, 4000741]      * |       389
##   [2]     chr1 [3004106, 4000741]      * |       389
##   [3]     chr1 [4000738, 5001375]      * |       334
##   [4]     chr1 [4000738, 5001375]      * |       334
##   [5]     chr1 [4000738, 5001375]      * |       334
##   [6]     chr1 [5001372, 5997485]      * |       340
##   -------
##   seqinfo: 66 sequences from an unspecified genome
\end{verbatim}
\begin{alltt}
\hlkwd{head}\hlstd{(}\hlkwd{anchors}\hlstd{(data,} \hlkwc{type}\hlstd{=}\hlstr{"second"}\hlstd{))}
\end{alltt}
\begin{verbatim}
## GRanges object with 6 ranges and 1 metadata column:
##       seqnames             ranges strand |    nfrags
##          <Rle>          <IRanges>  <Rle> | <integer>
##   [1]     chr1 [      1, 3004109]      * |         1
##   [2]     chr1 [3004106, 4000741]      * |       389
##   [3]     chr1 [      1, 3004109]      * |         1
##   [4]     chr1 [3004106, 4000741]      * |       389
##   [5]     chr1 [4000738, 5001375]      * |       334
##   [6]     chr1 [      1, 3004109]      * |         1
##   -------
##   seqinfo: 66 sequences from an unspecified genome
\end{verbatim}
\end{kframe}
\end{knitrout}

The returned \Rclass{InteractionSet} object contains a count matrix with the number of read pairs for each interaction in each library.
It also contains a \Rcode{totals} vector in the \Rcode{colData}, which stores the total number of read pairs for each library.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{head}\hlstd{(}\hlkwd{assay}\hlstd{(data))}
\end{alltt}
\begin{verbatim}
##       [,1]  [,2]  [,3]  [,4]
## [1,]    83    48    33    19
## [2,] 21332 17151 12894 12357
## [3,]    20    20    17     6
## [4,]  8215  7023  4399  4237
## [5,] 14729 12460  8985  8443
## [6,]     7     2     3     0
\end{verbatim}
\begin{alltt}
\hlstd{data}\hlopt{$}\hlstd{totals}
\end{alltt}
\begin{verbatim}
## [1] 85786306 74685186 60860491 54596160
\end{verbatim}
\end{kframe}
\end{knitrout}

Bin pairs can also be filtered to remove those with to a count sum below \Rcode{filter}.
This removes uninformative bin pairs with very few read pairs, and reduces the memory footprint of the function.
A higher value of \Rcode{filter} may be necessary for analyses of large datasets in limited memory.
More sophisticated filtering strategies are discussed in Chapter~\ref{chap:filter}.

\subsection{Choosing a bin width}
The \Rcode{width} of the bin is specified in base pairs and determines the spatial resolution of the analysis.
Smaller bins will have greater spatial resolution as adjacent features can be distinguished in the interaction space.
Larger bins will have greater counts as a larger area is used to collect read pairs.
Optimal summarization will not be achieved if bins are too small or too large to capture the (changes in) intensity of the underlying interactions.

For this analysis, 1 Mbp bins are used to capture broad structural features.
This is also useful for demonstration purposes, as the counts are large enough for clear manifestation of biases in later chapters.
Of course, \Biocpkg{diffHic} is more than capable of handling smaller sizes (e.g., below 20 kbp) for higher-resolution analyses of looping interactions between specific elements.
In such cases, \Rcode{filter} should be increased to avoid excessive memory usage.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{head}\hlstd{(}\hlkwd{regions}\hlstd{(data))}
\end{alltt}
\begin{verbatim}
## GRanges object with 6 ranges and 1 metadata column:
##       seqnames             ranges strand |    nfrags
##          <Rle>          <IRanges>  <Rle> | <integer>
##   [1]     chr1 [      1, 3004109]      * |         1
##   [2]     chr1 [3004106, 4000741]      * |       389
##   [3]     chr1 [4000738, 5001375]      * |       334
##   [4]     chr1 [5001372, 5997485]      * |       340
##   [5]     chr1 [5997482, 7000260]      * |       342
##   [6]     chr1 [7000257, 8000015]      * |       349
##   -------
##   seqinfo: 66 sequences from an unspecified genome
\end{verbatim}
\end{kframe}
\end{knitrout}

The boundary of each bin is rounded to the closest restriction site in \Rfunction{squareCounts}.
This is due to the inherent limits on spatial resolution in a Hi-C experiment.
The number of restriction fragments in each bin is recorded in the \Rcode{nfrags} field of the metadata.

Determination of the ideal bin size is not trivial as the features of interest are not usually known in advance.
Instead, repeated analyses with multiple bin sizes are recommended.
This provides some robustness to the choice of bin size.
Sharp interactions can be detected by pairs of smaller bins while diffuse interactions can be detected by larger bin pairs.
See Section~\ref{sec:mergebins} for more information on consolidating results from multiple bin sizes.

\section{Counting with pre-defined regions}

\subsection{Connecting counts between pairs of regions}
For some studies, prior knowledge about the regions of interest may be available.
For example, a researcher may be interested in examining interactions between genes.
The coordinates can be obtained from existing annotation, as shown below for the mouse genome.
Other pre-specified regions can also be used, e.g., known enhancers or protein binding sites.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{library}\hlstd{(TxDb.Mmusculus.UCSC.mm10.knownGene)}
\hlstd{gene.body} \hlkwb{<-} \hlkwd{genes}\hlstd{(TxDb.Mmusculus.UCSC.mm10.knownGene)}
\hlkwd{strand}\hlstd{(gene.body)} \hlkwb{<-} \hlstr{"*"} \hlcom{# Removing strand info, for simplicity.}
\end{alltt}
\end{kframe}
\end{knitrout}

Counting is directly performed for these defined regions using the \Rfunction{connectCounts} function.
Interactions are defined between each pair of regions in the pre-specified set.
This may be easier to interpret than pairs of bins if the interacting regions have some biological significance.
The count matrix and the vector of totals are defined as previously described.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{redata} \hlkwb{<-} \hlkwd{connectCounts}\hlstd{(input, mm.param,} \hlkwc{regions}\hlstd{=gene.body)}
\hlstd{redata}
\end{alltt}
\begin{verbatim}
## class: InteractionSet
## dim: 12926724 4
## metadata(1): param
## assays(1): counts
## rownames: NULL
## rowData names(0):
## colnames: NULL
## colData names(1): totals
## type: ReverseStrictGInteractions
## regions: 24044
\end{verbatim}
\end{kframe}
\end{knitrout}

Again, first/second anchor notation applies whereby the interval with the larger start coordinate in the genome is defined as the first anchor.
Note that the anchor may not have a larger end coordinate if the supplied \Rcode{regions} are nested.
In addition, each region is rounded to the nearest restriction site.
Resorting is also performed, though the indices of the original regions can be found in the metadata as \Rcode{original} if back-referencing is necessary.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlkwd{head}\hlstd{(}\hlkwd{regions}\hlstd{(redata))}
\end{alltt}
\begin{verbatim}
## GRanges object with 6 ranges and 2 metadata columns:
##          seqnames             ranges strand |    nfrags  original
##             <Rle>          <IRanges>  <Rle> | <integer> <integer>
##   497097     chr1 [3211067, 3675240]      * |       197     15372
##    19888     chr1 [4286646, 4409818]      * |        49      7094
##    20671     chr1 [4487488, 4498343]      * |         2      7482
##    27395     chr1 [4772601, 4786029]      * |         3     13110
##    18777     chr1 [4802092, 4847111]      * |         8      6523
##    21399     chr1 [4856126, 4899085]      * |        16      8171
##   -------
##   seqinfo: 66 sequences (1 circular) from mm10 genome
\end{verbatim}
\end{kframe}
\end{knitrout}

One obvious limitation of this approach is that interactions involving unspecified regions will be ignored.
This is obviously problematic when searching for novel interacting loci.
Another issue is that the width of the regions cannot be easily changed.
This means that the compromise between spatial resolution and count size cannot be tuned.
For example, interactions will not be detected around smaller genes as the counts will be too small.
Conversely, interactions between distinct loci within a large gene body will not be resolved.

\subsection{Connecting counts between two sets of regions}
The \Rfunction{connectCounts} function can also be used to identify interactions between two sets of regions, by specifying a value for the \Rcode{second.regions} argument.
This only considers interactions between one entry in \Rcode{regions} and another in \Rcode{second.regions}.
This differs from the standard application of the function, which would consider an interaction between any pair of entries in \Rcode{regions}.
If an integer scalar is supplied as \Rcode{second.regions}, the second set is automatically defined as contiguous bins of that size across the genome.

The use of \Rcode{second.regions} is particularly useful in cases where there are defined ``viewpoints'' of interest, e.g., 4C-seq, Capture-C.
These viewpoints can be specified in \Rcode{regions}, as shown below for a set of mock probe locations for a hypothetical Capture-C experiment \cite{hughes2014analysis}.
Specifically, the viewpoint is defined as a 100 kbp bin centred at each capture probe.
The interaction profile across the rest of the genome can then be extracted by setting \Rcode{second.regions} to some bin size.
In this case, 100 kbp bins are used.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{probe.loc} \hlkwb{<-} \hlkwd{GRanges}\hlstd{(}\hlkwd{c}\hlstd{(}\hlstr{"chr1"}\hlstd{,} \hlstr{"chr2"}\hlstd{,} \hlstr{"chr3"}\hlstd{),}
                     \hlkwd{IRanges}\hlstd{(}\hlkwd{c}\hlstd{(}\hlnum{1e7}\hlstd{,} \hlnum{2e7}\hlstd{,} \hlnum{3e7}\hlstd{),} \hlkwd{c}\hlstd{(}\hlnum{1e7}\hlstd{,} \hlnum{2e7}\hlstd{,} \hlnum{3e7}\hlstd{)))}
\hlstd{viewpoints} \hlkwb{<-} \hlkwd{resize}\hlstd{(probe.loc,} \hlkwc{fix}\hlstd{=}\hlstr{"center"}\hlstd{,} \hlkwc{width}\hlstd{=}\hlnum{1e5}\hlstd{)}
\hlstd{viewdata} \hlkwb{<-} \hlkwd{connectCounts}\hlstd{(input, mm.param,} \hlkwc{regions}\hlstd{=viewpoints,}
                          \hlkwc{second.regions}\hlstd{=}\hlnum{1e5}\hlstd{)}
\hlkwd{head}\hlstd{(}\hlkwd{anchors}\hlstd{(viewdata,} \hlkwc{type}\hlstd{=}\hlstr{"first"}\hlstd{))}
\end{alltt}
\begin{verbatim}
## GRanges object with 6 ranges and 3 metadata columns:
##       seqnames              ranges strand |    nfrags is.second  original
##          <Rle>           <IRanges>  <Rle> | <integer> <logical> <integer>
##   [1]     chr1 [9945397, 10054278]      * |        38         0         1
##   [2]     chr1 [9945397, 10054278]      * |        38         0         1
##   [3]     chr1 [9945397, 10054278]      * |        38         0         1
##   [4]     chr1 [9945397, 10054278]      * |        38         0         1
##   [5]     chr1 [9945397, 10054278]      * |        38         0         1
##   [6]     chr1 [9945397, 10054278]      * |        38         0         1
##   -------
##   seqinfo: 66 sequences from an unspecified genome
\end{verbatim}
\begin{alltt}
\hlkwd{head}\hlstd{(}\hlkwd{anchors}\hlstd{(viewdata,} \hlkwc{type}\hlstd{=}\hlstr{"second"}\hlstd{))}
\end{alltt}
\begin{verbatim}
## GRanges object with 6 ranges and 3 metadata columns:
##       seqnames             ranges strand |    nfrags is.second  original
##          <Rle>          <IRanges>  <Rle> | <integer> <logical> <integer>
##   [1]     chr1 [      1, 3004109]      * |         1         1      <NA>
##   [2]     chr1 [3004106, 3100835]      * |        34         1      <NA>
##   [3]     chr1 [3100832, 3194461]      * |        30         1      <NA>
##   [4]     chr1 [3194458, 3301641]      * |        41         1      <NA>
##   [5]     chr1 [3301638, 3399158]      * |        52         1      <NA>
##   [6]     chr1 [3399155, 3501374]      * |        39         1      <NA>
##   -------
##   seqinfo: 66 sequences from an unspecified genome
\end{verbatim}
\end{kframe}
\end{knitrout}

As these results demonstrate, interactions are only considered if exactly one interacting locus is from the specified \Rcode{regions}.
The identity of the other locus (i.e., from \Rcode{second.regions}) can be determined based on the \Rcode{is.second} field in the \Rclass{GRanges} object.
This approach avoids loading irrelevant interactions when only specific viewpoints are of interest.

\section{Counting into single bins}
\label{sec:marginal}
The \Rfunction{marginalCounts} function counts the number of reads mapped inside each bin.
This effectively treats each Hi-C library as single-end data to quantify the genomic coverage of each bin.
One can use these ``marginal'' counts to determine whether there are systematic differences in coverage between libraries for a given bin.
This implies that copy number variations are present, which may confound detection of differential interactions.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{margin.data} \hlkwb{<-} \hlkwd{marginCounts}\hlstd{(input, mm.param,} \hlkwc{width}\hlstd{=bin.size)}
\hlstd{margin.data}
\end{alltt}
\begin{verbatim}
## class: RangedSummarizedExperiment
## dim: 2739 4
## metadata(1): param
## assays(1): counts
## rownames: NULL
## rowData names(1): nfrags
## colnames: NULL
## colData names(1): totals
\end{verbatim}
\end{kframe}
\end{knitrout}

Each row of the \Rclass{RangedSummarizedExperiment} contains the marginal read counts for a genomic bin.
For this dataset, there are no major changes in coverage for the vast majority of bins.
The most extreme events occur at low abundances and are unlikely to be precise.
This suggests that a direct comparison of interaction intensities will be valid.
Remedial action in the presence of copy number changes is not trivial and will be discussed in Section~\ref{sec:copy}.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{adjc} \hlkwb{<-} \hlkwd{cpm}\hlstd{(}\hlkwd{asDGEList}\hlstd{(margin.data),} \hlkwc{log}\hlstd{=}\hlnum{TRUE}\hlstd{,} \hlkwc{prior.count}\hlstd{=}\hlnum{5}\hlstd{)}
\hlkwd{smoothScatter}\hlstd{(}\hlnum{0.5}\hlopt{*}\hlstd{(adjc[,}\hlnum{1}\hlstd{]}\hlopt{+}\hlstd{adjc[,}\hlnum{3}\hlstd{]), adjc[,}\hlnum{1}\hlstd{]}\hlopt{-}\hlstd{adjc[,}\hlnum{3}\hlstd{],}
              \hlkwc{xlab}\hlstd{=}\hlstr{"A"}\hlstd{,} \hlkwc{ylab}\hlstd{=}\hlstr{"M"}\hlstd{,} \hlkwc{main}\hlstd{=}\hlstr{"Flox (1) vs. Ko (1)"}\hlstd{)}
\end{alltt}
\end{kframe}\begin{adjustwidth}{2\fltoffset}{2\fltoffset}
\includegraphics[width=\maxwidth]{plots-ug/mamargin-1} \end{adjustwidth}
\end{knitrout}

\section{Additional parameter options}

\subsection{Restricting the input chromosomes}
\label{sec:restrictchr}

Users can restrict counting to particular chromosomes by setting the \Rcode{restrict} slot in the \Rclass{pairParam} object.
This is useful to ensure that only interactions between relevant chromosomes are loaded.
Sequences such as the mitochondrial genome, unassigned contigs or random chromosome segments can be ignored in routine analyses.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{new.param} \hlkwb{<-} \hlkwd{reform}\hlstd{(mm.param,} \hlkwc{restrict}\hlstd{=}\hlkwd{c}\hlstd{(}\hlstr{"chr1"}\hlstd{,} \hlstr{"chr2"}\hlstd{))}
\hlstd{new.param}
\end{alltt}
\begin{verbatim}
## Genome contains 851636 restriction fragments across 66 chromosomes
## No discard regions are specified
## Read extraction is limited to 2 chromosomes
## No cap on the read pairs per pair of restriction fragments
\end{verbatim}
\end{kframe}
\end{knitrout}

In addition, if \Rcode{restrict} is a $n$-by-2 matrix, count loading will be limited to the read pairs that are mapped between the $n$ specified pairs of chromosomes.
The example below considers all read pairs mapped between chromosomes 2 and 19.
This feature is useful when memory is limited, as each pair of chromosomes can be loaded and analyzed separately.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{new.param} \hlkwb{<-} \hlkwd{reform}\hlstd{(mm.param,} \hlkwc{restrict}\hlstd{=}\hlkwd{cbind}\hlstd{(}\hlstr{"chr2"}\hlstd{,} \hlstr{"chr19"}\hlstd{))}
\hlstd{new.param}
\end{alltt}
\begin{verbatim}
## Genome contains 851636 restriction fragments across 66 chromosomes
## No discard regions are specified
## Read extraction is limited to pairs between:
## 	'chr2' and 'chr19'
## No cap on the read pairs per pair of restriction fragments
\end{verbatim}
\end{kframe}
\end{knitrout}

\subsection{Specifying regions to ignore}
Users can also discard alignments that lie within blacklisted regions by setting the \Rcode{discard} slot.
The aim is to eliminate reads within known repeat regions.
Such regions are problematic, as reads from several repeat units in the real genome may be collated into a single representative unit in the genome build.
This results in a sharp, spurious spike in interaction intensity.
The problem is exacerbated by different repeat copy numbers between biological conditions, resulting in spurious differential interactions due to changes in coverage.
Removal of reads in these repeats may be necessary to obtain reliable results.

\begin{knitrout}
\definecolor{shadecolor}{rgb}{0.941, 0.941, 0.941}\color{fgcolor}\begin{kframe}
\begin{alltt}
\hlstd{dummy.repeat} \hlkwb{<-} \hlkwd{GRanges}\hlstd{(}\hlstr{"chr1"}\hlstd{,} \hlkwd{IRanges}\hlstd{(}\hlnum{10000}\hlstd{,} \hlnum{1000000}\hlstd{))}
\hlstd{new.param} \hlkwb{<-} \hlkwd{reform}\hlstd{(mm.param,} \hlkwc{discard}\hlstd{=dummy.repeat)}
\hlstd{new.param}
\end{alltt}
\begin{verbatim}
## Genome contains 851636 restriction fragments across 66 chromosomes
## 1 region specified in which alignments are discarded
## No limits on chromosomes for read extraction
## No cap on the read pairs per pair of restriction fragments
\end{verbatim}
\end{kframe}
\end{knitrout}

Coordinates of annotated repeats can be obtained from several different sources.
A curated blacklist of problematic regions is available from the ENCODE project \cite{dunham2012encode}, and can be obtained at \url{https://sites.google.com/site/anshulkundaje/projects/blacklists}.
This list is constructed empirically from the ENCODE datasets and includes obvious offenders like telomeres, microsatellites and some rDNA genes.
Alternatively, repeats can be predicted from the genome sequence using software like RepeatMasker.
These calls are available from the UCSC website (e.g., \url{hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/chromOut.tar.gz} for mouse) or they can be extracted from an appropriate masked \Rclass{BSgenome} object.