linkSet 1.0.0
30 October 2025
linkSet 1.0.0
This vignette provides a step-by-step guide to using the linkSet package to analyze Hi-C/HiChIP data. We will use a toy example to illustrate the main functions and workflows. Our goal is to identify the enhancer-gene links in this example.
We will use the following datasets as input:
We highly recommend you to use custom data instead of the example data provided in this vignette.
suppressPackageStartupMessages({
  library(linkSet)
  # Load additional packages only if available
  if (requireNamespace("TxDb.Mmusculus.UCSC.mm10.knownGene", quietly = TRUE)) {
    library(TxDb.Mmusculus.UCSC.mm10.knownGene)
  }
  if (requireNamespace("org.Mm.eg.db", quietly = TRUE)) {
    library(org.Mm.eg.db)
  }
  if (requireNamespace("Organism.dplyr", quietly = TRUE)) {
    library(Organism.dplyr)
  }
  if (requireNamespace("InteractionSet", quietly = TRUE)) {
    library(InteractionSet)
  }
  if (requireNamespace("rtracklayer", quietly = TRUE)) {
    library(rtracklayer)
  }
})We use our custom function readvalidPairs to load the example data.
Firstly, we need to load into GInteractions object.
hic_file <- system.file("extdata", "toyExample.pair.gz",
  package = "linkSet"
)
gi <- readvalidPairs(hic_file, format = "pair")
promoterGr <- tryCatch(
  {
    withTxDb("mm10", function(src) {
      genes <- Organism.dplyr::genes(src, columns = "symbol")
      IRanges::promoters(genes, upstream = 10000)
    })
  },
  error = function(e) {
    # Fallback approach using TxDb directly
    if (requireNamespace("TxDb.Mmusculus.UCSC.mm10.knownGene", quietly = TRUE) &&
      requireNamespace("org.Mm.eg.db", quietly = TRUE)) {
      txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene::TxDb.Mmusculus.UCSC.mm10.knownGene
      gene_ranges <- GenomicFeatures::genes(txdb)
      # Add gene symbols
      gene_ids <- names(gene_ranges)
      symbols <- AnnotationDbi::select(org.Mm.eg.db::org.Mm.eg.db,
        keys = gene_ids,
        columns = "SYMBOL",
        keytype = "ENTREZID"
      )
      symbol_map <- setNames(symbols$SYMBOL, symbols$ENTREZID)
      mcols(gene_ranges)$symbol <- symbol_map[names(gene_ranges)]
      genes <- gene_ranges[!is.na(mcols(gene_ranges)$symbol)]
      IRanges::promoters(genes, upstream = 10000)
    } else {
      stop("Required packages not available for mm10 annotation")
    }
  }
)
file_path <- system.file("extdata", "Embryo_body.bed.gz", package = "linkSet")
enhancer <- rtracklayer::import(file_path, format = "BED")Because the hic data only contains digist end, so we resize the region to upstream 5kb and downstream 5kb.
After that, we use baitGInteractions to generate the linkSet object.
gi <- resize(gi, 10000, fix = "center")
ls <- baitGInteractions(gi, promoterGr, enhancer, geneSymbol = "symbol")
ls
#> linkSet object with 27 interactions and 2 metadata columns:
#>               bait     seqnames_oe         ranges_oe | anchor1.symbol
#>        <character>           <Rle>         <IRanges> |    <character>
#>    [1]        Xkr4 ---        chr1 51753961-51754690 |           Xkr4
#>    [2]        Xkr4 ---        chr1 38494581-38496180 |           Xkr4
#>    [3]        Xkr4 ---        chr1 38496241-38496840 |           Xkr4
#>    [4]         Rp1 ---        chr1 14496641-14497000 |            Rp1
#>    [5]       Sox17 ---        chr1 90401091-90402520 |          Sox17
#>    ...         ... ...         ...               ... .            ...
#>   [23]      Npbwr1 ---        chr1 59014721-59015250 |         Npbwr1
#>   [24]      Rb1cc1 ---        chr1 45369781-45370210 |         Rb1cc1
#>   [25]      Rb1cc1 ---        chr1 33869921-33870570 |         Rb1cc1
#>   [26]      Rb1cc1 ---        chr1 94074541-94075190 |         Rb1cc1
#>   [27]      Alkal1 ---        chr1 12683361-12683590 |         Alkal1
#>            anchor2.name
#>             <character>
#>    [1] 11.4716435093611
#>    [2] 11.9206847715326
#>    [3] 11.1575156050835
#>    [4] 16.1891332694206
#>    [5] 11.3810398990553
#>    ...              ...
#>   [23] 11.2298655414852
#>   [24] 16.4355294570208
#>   [25] 11.2829690359357
#>   [26] 10.9286062722993
#>   [27] 14.0682990384515
#>   -------
#>   regions: 30 ranges and 0 metadata columns
#>   seqinfo: 66 sequences (1 circular) from mm10 genomeWhen we print the linkSet object, we can see the basic information of the linkSet object.
By default, we don’t show the bait region. But you are interested in the bait region, you can set showBaitRegion = TRUE.
showLinkSet(ls, baitRegion = TRUE)
#> linkSet object with 27 interactions and 2 metadata columns:
#>        bait seqnames_bait     ranges_bait     seqnames_oe         ranges_oe |
#>  [1]   Xkr4          chr1 3671299-3681498 ---        chr1 51753961-51754690 |
#>  [2]   Xkr4          chr1 3671299-3681498 ---        chr1 38494581-38496180 |
#>  [3]   Xkr4          chr1 3671299-3681498 ---        chr1 38496241-38496840 |
#>  [4]    Rp1          chr1 4409042-4419241 ---        chr1 14496641-14497000 |
#>  [5]  Sox17          chr1 4497155-4507354 ---        chr1 90401091-90402520 |
#>  ...    ...           ...             ... ...         ...               ... .
#> [23] Npbwr1          chr1 5917199-5927398 ---        chr1 59014721-59015250 |
#> [24] Rb1cc1          chr1 6196197-6206396 ---        chr1 45369781-45370210 |
#> [25] Rb1cc1          chr1 6196197-6206396 ---        chr1 33869921-33870570 |
#> [26] Rb1cc1          chr1 6196197-6206396 ---        chr1 94074541-94075190 |
#> [27] Alkal1          chr1 6349218-6359417 ---        chr1 12683361-12683590 |
#>      anchor1.symbol     anchor2.name
#>  [1]           Xkr4 11.4716435093611
#>  [2]           Xkr4 11.9206847715326
#>  [3]           Xkr4 11.1575156050835
#>  [4]            Rp1 16.1891332694206
#>  [5]          Sox17 11.3810398990553
#>  ...            ...              ...
#> [23]         Npbwr1 11.2298655414852
#> [24]         Rb1cc1 16.4355294570208
#> [25]         Rb1cc1 11.2829690359357
#> [26]         Rb1cc1 10.9286062722993
#> [27]         Alkal1 14.0682990384515Now, we can run diagnoseLinkSet to check the distance distribution and inter/intra interaction percentage. First, let’s annotate the promoter regions with genome information:
#> linkSet object with 27 interactions and 4 metadata columns:
#>               bait     seqnames_oe         ranges_oe | anchor1.symbol
#>        <character>           <Rle>         <IRanges> |    <character>
#>    [1]        Xkr4 ---        chr1 51753961-51754690 |           Xkr4
#>    [2]        Xkr4 ---        chr1 38494581-38496180 |           Xkr4
#>    [3]        Xkr4 ---        chr1 38496241-38496840 |           Xkr4
#>    [4]         Rp1 ---        chr1 14496641-14497000 |            Rp1
#>    [5]       Sox17 ---        chr1 90401091-90402520 |          Sox17
#>    ...         ... ...         ...               ... .            ...
#>   [23]      Npbwr1 ---        chr1 59014721-59015250 |         Npbwr1
#>   [24]      Rb1cc1 ---        chr1 45369781-45370210 |         Rb1cc1
#>   [25]      Rb1cc1 ---        chr1 33869921-33870570 |         Rb1cc1
#>   [26]      Rb1cc1 ---        chr1 94074541-94075190 |         Rb1cc1
#>   [27]      Alkal1 ---        chr1 12683361-12683590 |         Alkal1
#>            anchor2.name  inter_type  distance
#>             <character> <character> <integer>
#>    [1] 11.4716435093611       inter  48077927
#>    [2] 11.9206847715326       inter  34818982
#>    [3] 11.1575156050835       inter  34820142
#>    [4] 16.1891332694206       inter  10082679
#>    [5] 11.3810398990553       inter  85899551
#>    ...              ...         ...       ...
#>   [23] 11.2298655414852       inter  53092687
#>   [24] 16.4355294570208       inter  39168699
#>   [25] 11.2829690359357       inter  27668949
#>   [26] 10.9286062722993       inter  87873569
#>   [27] 14.0682990384515       inter   6329158
#>   -------
#>   regions: 30 ranges and 0 metadata columns
#>   seqinfo: 66 sequences (1 circular) from mm10 genomeIntrachromosomal interaction and long distance interaction are likely be noise, so we filter them.
ls <- countInteractibility(ls)
ls <- filterLinks(ls, filter_intra = TRUE, filter_unannotate = TRUE, distance = 50000000)Duplicated links are associated with contact frequency, so it’s a good idea to count duplicated links.
ls <- countInteractions(ls)
orderLinks(ls, by = "count", decreasing = TRUE)
#> linkSet object with 15 interactions and 5 metadata columns:
#>               bait     seqnames_oe         ranges_oe | anchor1.symbol
#>        <character>           <Rle>         <IRanges> |    <character>
#>    [1]        Xkr4 ---        chr1 51753961-51754690 |           Xkr4
#>    [2]        Xkr4 ---        chr1 38494581-38496180 |           Xkr4
#>    [3]        Xkr4 ---        chr1 38496241-38496840 |           Xkr4
#>    [4]         Rp1 ---        chr1 14496641-14497000 |            Rp1
#>    [5]       Sox17 ---        chr1   4561881-4562150 |          Sox17
#>    ...         ... ...         ...               ... .            ...
#>   [11]       Rgs20 ---        chr1 44099101-44099360 |          Rgs20
#>   [12]      Npbwr1 ---        chr1 39818891-39819140 |         Npbwr1
#>   [13]      Rb1cc1 ---        chr1 45369781-45370210 |         Rb1cc1
#>   [14]      Rb1cc1 ---        chr1 33869921-33870570 |         Rb1cc1
#>   [15]      Alkal1 ---        chr1 12683361-12683590 |         Alkal1
#>            anchor2.name  inter_type     count  distance
#>             <character> <character> <integer> <integer>
#>    [1] 11.4716435093611       inter         1  48077927
#>    [2] 11.9206847715326       inter         1  34818982
#>    [3] 11.1575156050835       inter         1  34820142
#>    [4] 16.1891332694206       inter         1  10082679
#>    [5]  13.507493224644       inter         1     59761
#>    ...              ...         ...       ...       ...
#>   [11] 11.3026423239041       inter         1  39024045
#>   [12] 12.9148641505382       inter         1  33896717
#>   [13] 16.4355294570208       inter         1  39168699
#>   [14] 11.2829690359357       inter         1  27668949
#>   [15] 14.0682990384515       inter         1   6329158
#>   -------
#>   regions: 20 ranges and 0 metadata columns
#>   seqinfo: 66 sequences (1 circular) from mm10 genomeWe can notice that there is a significant link strength between Sulf1 and chr1:12785091-12785750.
Enhancers that regulate multiple genes are biologically meaningful.
ls <- crossGeneEnhancer(ls)
ls <- orderLinks(ls, by = "crossFreq", decreasing = TRUE)
ls
#> linkSet object with 15 interactions and 6 metadata columns:
#>               bait     seqnames_oe         ranges_oe | anchor1.symbol
#>        <character>           <Rle>         <IRanges> |    <character>
#>    [1]       Sox17 ---        chr1   4561881-4562150 |          Sox17
#>    [2]      Lypla1 ---        chr1   4561881-4562150 |         Lypla1
#>    [3]      Mrpl15 ---        chr1   4561881-4562150 |         Mrpl15
#>    [4]      Lypla1 ---        chr1 33869921-33870570 |         Lypla1
#>    [5]      Mrpl15 ---        chr1 33869921-33870570 |         Mrpl15
#>    ...         ... ...         ...               ... .            ...
#>   [11]      Lypla1 ---        chr1 34163531-34164100 |         Lypla1
#>   [12]       Rgs20 ---        chr1 44099101-44099360 |          Rgs20
#>   [13]      Npbwr1 ---        chr1 39818891-39819140 |         Npbwr1
#>   [14]      Rb1cc1 ---        chr1 45369781-45370210 |         Rb1cc1
#>   [15]      Alkal1 ---        chr1 12683361-12683590 |         Alkal1
#>            anchor2.name  inter_type     count  distance crossFreq
#>             <character> <character> <integer> <integer> <integer>
#>    [1]  13.507493224644       inter         1     59761         3
#>    [2]  13.507493224644       inter         1    240872         3
#>    [3]  13.507493224644       inter         1    228624         3
#>    [4] 11.2829690359357       inter         1  29067358         3
#>    [5] 11.2829690359357       inter         1  29079606         3
#>    ...              ...         ...       ...       ...       ...
#>   [11] 11.2903880454209       inter         1  29360928         1
#>   [12] 11.3026423239041       inter         1  39024045         1
#>   [13] 12.9148641505382       inter         1  33896717         1
#>   [14] 16.4355294570208       inter         1  39168699         1
#>   [15] 14.0682990384515       inter         1   6329158         1
#>   -------
#>   regions: 20 ranges and 0 metadata columns
#>   seqinfo: 66 sequences (1 circular) from mm10 genomeWe can use plot_genomic_ranges to visualize the cross gene links.
We can also choose to visualze the bait center region.
# Check if regionsBait exists and has names
if (!is.null(regionsBait(ls)) && !is.null(names(regionsBait(ls)))) {
  # Get available bait names from regionsBait
  available_baits <- unique(names(regionsBait(ls)))
  if (length(available_baits) > 0) {
    # Use the first available bait for demonstration
    plot_genomic_ranges(ls, showBait = available_baits[1])
  } else {
    cat("No named bait regions available for plotting")
  }
} else {
  # If no regionsBait, just plot without showBait
  plot_genomic_ranges(ls)
}sessionInfo()
#> R version 4.5.1 Patched (2025-08-23 r88802)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.3 LTS
#> 
#> Matrix products: default
#> BLAS:   /home/biocbuild/bbs-3.22-bioc/R/lib/libRblas.so 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_GB              LC_COLLATE=C              
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: America/New_York
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats4    stats     graphics  grDevices utils     datasets  methods  
#> [8] base     
#> 
#> other attached packages:
#>  [1] rtracklayer_1.70.0                       
#>  [2] InteractionSet_1.38.0                    
#>  [3] SummarizedExperiment_1.40.0              
#>  [4] MatrixGenerics_1.22.0                    
#>  [5] matrixStats_1.5.0                        
#>  [6] Organism.dplyr_1.38.0                    
#>  [7] AnnotationFilter_1.34.0                  
#>  [8] dplyr_1.1.4                              
#>  [9] org.Mm.eg.db_3.22.0                      
#> [10] TxDb.Mmusculus.UCSC.mm10.knownGene_3.10.0
#> [11] GenomicFeatures_1.62.0                   
#> [12] AnnotationDbi_1.72.0                     
#> [13] Biobase_2.70.0                           
#> [14] linkSet_1.0.0                            
#> [15] GenomicRanges_1.62.0                     
#> [16] Seqinfo_1.0.0                            
#> [17] IRanges_2.44.0                           
#> [18] S4Vectors_0.48.0                         
#> [19] BiocGenerics_0.56.0                      
#> [20] generics_0.1.4                           
#> [21] BiocStyle_2.38.0                         
#> 
#> loaded via a namespace (and not attached):
#>  [1] tidyselect_1.2.1         farver_2.1.2             blob_1.2.4              
#>  [4] S7_0.2.0                 filelock_1.0.3           Biostrings_2.78.0       
#>  [7] bitops_1.0-9             fastmap_1.2.0            lazyeval_0.2.2          
#> [10] RCurl_1.98-1.17          BiocFileCache_3.0.0      GenomicAlignments_1.46.0
#> [13] XML_3.99-0.19            digest_0.6.37            lifecycle_1.0.4         
#> [16] KEGGREST_1.50.0          RSQLite_2.4.3            magrittr_2.0.4          
#> [19] compiler_4.5.1           rlang_1.1.6              sass_0.4.10             
#> [22] tools_4.5.1              yaml_2.3.10              data.table_1.17.8       
#> [25] knitr_1.50               labeling_0.4.3           S4Arrays_1.10.0         
#> [28] bit_4.6.0                curl_7.0.0               DelayedArray_0.36.0     
#> [31] RColorBrewer_1.1-3       abind_1.4-8              BiocParallel_1.44.0     
#> [34] withr_3.0.2              purrr_1.1.0              grid_4.5.1              
#> [37] ggplot2_4.0.0            scales_1.4.0             iterators_1.0.14        
#> [40] tinytex_0.57             dichromat_2.0-0.1        cli_3.6.5               
#> [43] rmarkdown_2.30           crayon_1.5.3             httr_1.4.7              
#> [46] rjson_0.2.23             DBI_1.2.3                cachem_1.1.0            
#> [49] parallel_4.5.1           BiocManager_1.30.26      XVector_0.50.0          
#> [52] restfulr_0.0.16          vctrs_0.6.5              Matrix_1.7-4            
#> [55] jsonlite_2.0.0           bookdown_0.45            patchwork_1.3.2         
#> [58] bit64_4.6.0-1            magick_2.9.0             foreach_1.5.2           
#> [61] jquerylib_0.1.4          glue_1.8.0               codetools_0.2-20        
#> [64] gtable_0.3.6             GenomeInfoDb_1.46.0      BiocIO_1.20.0           
#> [67] UCSC.utils_1.6.0         tibble_3.3.0             pillar_1.11.1           
#> [70] rappdirs_0.3.3           htmltools_0.5.8.1        R6_2.6.1                
#> [73] dbplyr_2.5.1             httr2_1.2.1              evaluate_1.0.5          
#> [76] lattice_0.22-7           png_0.1-8                Rsamtools_2.26.0        
#> [79] cigarillo_1.0.0          memoise_2.0.1            bslib_0.9.0             
#> [82] Rcpp_1.1.0               SparseArray_1.10.0       xfun_0.53               
#> [85] pkgconfig_2.0.3