## The R Data File in the /inst/extdata subdirectory was generated by running the OutSplice function
## on the example files in the /inst/extdata subdirectory using the hg19 build for annotations. The description as to
## how these files were made is documented below.

## The following script outlines the process used to create the trimmed example data from the TCGA used in the OutSplice vignette and man files

#1. First, download the zipped directories from the TCGA: http://firebrowse.org/?cohort=HNSC&download_dialog=true
# The directories are: "illuminahiseq_rnaseqv2-junction_quantification," "illuminahiseq_rnaseqv2-RSEM_genes_normalized," "mRNAseq_Preprocess"
# From these directories extract the following: "HNSC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__junction_quantification__data.data.txt", "HNSC.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt" ,"HNSC.uncv2.mRNAseq_raw_counts.txt"

#2. For ease, rename the files: "full_HNSC_junctions.txt", "full_HNSC_genes_normalized.txt", "full_HNSC_raw_counts_unc.txt"

#3. Set the working directory to the directory containing the files and read in the data
# All three tables are tab-delimited with a header row. stringsAsFactors is
# switched off for every read (not only two of the three) so the columns stay
# character on R < 4.0 as well; a later step converts them with as.numeric(),
# which would return factor level codes rather than the values for factors.
juncs_file <- read.table(file = "full_HNSC_junctions.txt", sep = '\t', header = TRUE, stringsAsFactors = FALSE)
RSEM_file <- read.table(file = "full_HNSC_genes_normalized.txt", header = TRUE, row.names = 1, sep = "\t", stringsAsFactors = FALSE)
rawcounts_file <- read.table(file = "full_HNSC_raw_counts_unc.txt", sep = '\t', header = TRUE, row.names = 1, stringsAsFactors = FALSE)

#4. In the junction file, drop all rows not on chromosome 1. This is rows 25237-249578
# Keep every row whose index lies outside the 25237-249578 range.
juncs_file <- juncs_file[setdiff(seq_len(nrow(juncs_file)), 25237:249578), ]

#5. Subset Example Tumors and Normals
library(dplyr)
# Replace every '.' in the column names of the three tables with '-'
# (presumably undoing the name mangling applied when the files were read, so
# the names match the TCGA barcodes used below -- confirm against the inputs).
names(rawcounts_file) <- gsub('\\.', "-", names(rawcounts_file))
names(juncs_file) <- gsub('\\.', "-", names(juncs_file))
names(RSEM_file) <- gsub('\\.', "-", names(RSEM_file))
# Keep only the 94 example samples in the raw counts table. The two-digit
# suffix is the TCGA sample type code used later in this script: "01" for
# tumors and "11" for normals.
rawcounts <- dplyr::select(
  rawcounts_file,
  "TCGA-BA-5152-01", "TCGA-BA-6869-01", "TCGA-BA-A6DL-01", "TCGA-BB-4227-01", "TCGA-BB-7864-01",
  "TCGA-CN-4725-01", "TCGA-CN-4727-01", "TCGA-CN-4730-01", "TCGA-CN-4731-01", "TCGA-CN-4736-01",
  "TCGA-CN-4739-01", "TCGA-CN-6988-01", "TCGA-CN-A49C-01", "TCGA-CN-A63W-01", "TCGA-CN-A641-01",
  "TCGA-CQ-5327-01", "TCGA-CQ-6227-01", "TCGA-CQ-7065-01", "TCGA-CQ-7069-01", "TCGA-CQ-A4C9-01",
  "TCGA-CR-5247-01", "TCGA-CR-6477-01", "TCGA-CR-6480-01", "TCGA-CR-7364-01", "TCGA-CR-7365-01",
  "TCGA-CR-7369-01", "TCGA-CR-7385-01", "TCGA-CV-6933-11", "TCGA-CV-6934-11", "TCGA-CV-6935-11",
  "TCGA-CV-6936-11", "TCGA-CV-6938-11", "TCGA-CV-6939-11", "TCGA-CV-6943-11", "TCGA-CV-6952-01",
  "TCGA-CV-6955-11", "TCGA-CV-6956-01", "TCGA-CV-6956-11", "TCGA-CV-6959-11", "TCGA-CV-6960-11",
  "TCGA-CV-6961-11", "TCGA-CV-6962-11", "TCGA-CV-7089-01", "TCGA-CV-7091-11", "TCGA-CV-7095-01",
  "TCGA-CV-7097-11", "TCGA-CV-7101-11", "TCGA-CV-7103-11", "TCGA-CV-7177-11", "TCGA-CV-7178-11",
  "TCGA-CV-7183-11", "TCGA-CV-7235-11", "TCGA-CV-7238-11", "TCGA-CV-7242-01", "TCGA-CV-7242-11",
  "TCGA-CV-7245-11", "TCGA-CV-7250-01", "TCGA-CV-7250-11", "TCGA-CV-7252-11", "TCGA-CV-7253-01",
  "TCGA-CV-7255-01", "TCGA-CV-7255-11", "TCGA-CV-7261-11", "TCGA-CV-7406-11", "TCGA-CV-7409-01",
  "TCGA-CV-7416-11", "TCGA-CV-7423-11", "TCGA-CV-7424-11", "TCGA-CV-7425-11", "TCGA-CV-7432-11",
  "TCGA-CV-7434-11", "TCGA-CV-7437-11", "TCGA-CV-7438-11", "TCGA-CV-7440-11", "TCGA-CV-7568-01",
  "TCGA-CV-A45R-01", "TCGA-CX-7086-01", "TCGA-D6-A6EO-01", "TCGA-F7-7848-01", "TCGA-H7-A6C4-01",
  "TCGA-H7-A6C4-11", "TCGA-H7-A6C5-11", "TCGA-HD-7917-01", "TCGA-HD-8224-01", "TCGA-HD-8635-11",
  "TCGA-HD-A6HZ-11", "TCGA-HD-A6I0-11", "TCGA-IQ-7632-01", "TCGA-P3-A5QF-01", "TCGA-P3-A6T8-01",
  "TCGA-TN-A7HJ-01", "TCGA-UF-A719-01", "TCGA-UF-A71D-01", "TCGA-WA-A7GZ-11"
)

# Full TCGA barcodes of the 94 example samples. The junction table and the
# RSEM table are subset to exactly the same barcodes, so the list is defined
# once and reused for both selections instead of being spelled out twice.
sample_barcodes <- c(
  "TCGA-BA-5152-01A-02R-1873-07", "TCGA-BA-6869-01A-11R-1873-07", "TCGA-BA-A6DL-01A-21R-A30B-07", "TCGA-BB-4227-01A-01R-1873-07", "TCGA-BB-7864-01A-11R-2232-07",
  "TCGA-CN-4725-01A-01R-1436-07", "TCGA-CN-4727-01A-01R-1436-07", "TCGA-CN-4730-01A-01R-1436-07", "TCGA-CN-4731-01A-01R-1436-07", "TCGA-CN-4736-01A-01R-1436-07",
  "TCGA-CN-4739-01A-02R-1514-07", "TCGA-CN-6988-01A-11R-1915-07", "TCGA-CN-A49C-01A-11R-A24H-07", "TCGA-CN-A63W-01A-11R-A30B-07", "TCGA-CN-A641-01A-11R-A30B-07",
  "TCGA-CQ-5327-01A-01R-1686-07", "TCGA-CQ-6227-01A-11R-1915-07", "TCGA-CQ-7065-01A-11R-2081-07", "TCGA-CQ-7069-01A-11R-2403-07", "TCGA-CQ-A4C9-01A-11R-A24Z-07",
  "TCGA-CR-5247-01A-01R-2016-07", "TCGA-CR-6477-01A-11R-1873-07", "TCGA-CR-6480-01A-11R-1873-07", "TCGA-CR-7364-01A-11R-2016-07", "TCGA-CR-7365-01A-11R-2016-07",
  "TCGA-CR-7369-01A-11R-2132-07", "TCGA-CR-7385-01A-11R-2016-07", "TCGA-CV-6933-11A-01R-1915-07", "TCGA-CV-6934-11A-01R-1915-07", "TCGA-CV-6935-11A-01R-1915-07",
  "TCGA-CV-6936-11A-01R-1915-07", "TCGA-CV-6938-11A-01R-1915-07", "TCGA-CV-6939-11A-01R-1915-07", "TCGA-CV-6943-11A-01R-1915-07", "TCGA-CV-6952-01A-11R-1915-07",
  "TCGA-CV-6955-11A-01R-2016-07", "TCGA-CV-6956-01A-21R-2016-07", "TCGA-CV-6956-11A-01R-2016-07", "TCGA-CV-6959-11A-01R-1915-07", "TCGA-CV-6960-11A-01R-2016-07",
  "TCGA-CV-6961-11A-01R-1915-07", "TCGA-CV-6962-11A-01R-1915-07", "TCGA-CV-7089-01A-11R-2016-07", "TCGA-CV-7091-11A-01R-2016-07", "TCGA-CV-7095-01A-21R-2016-07",
  "TCGA-CV-7097-11A-01R-2016-07", "TCGA-CV-7101-11A-01R-2016-07", "TCGA-CV-7103-11A-01R-2016-07", "TCGA-CV-7177-11A-01R-2016-07", "TCGA-CV-7178-11A-01R-2016-07",
  "TCGA-CV-7183-11A-01R-2016-07", "TCGA-CV-7235-11A-01R-2016-07", "TCGA-CV-7238-11A-01R-2016-07", "TCGA-CV-7242-01A-11R-2016-07", "TCGA-CV-7242-11A-01R-2016-07",
  "TCGA-CV-7245-11A-01R-2016-07", "TCGA-CV-7250-01A-11R-2016-07", "TCGA-CV-7250-11A-01R-2016-07", "TCGA-CV-7252-11A-01R-2016-07", "TCGA-CV-7253-01A-11R-2016-07",
  "TCGA-CV-7255-01A-11R-2016-07", "TCGA-CV-7255-11A-01R-2016-07", "TCGA-CV-7261-11A-01R-2016-07", "TCGA-CV-7406-11A-01R-2081-07", "TCGA-CV-7409-01A-31R-2232-07",
  "TCGA-CV-7416-11A-01R-2081-07", "TCGA-CV-7423-11A-01R-2081-07", "TCGA-CV-7424-11A-01R-2081-07", "TCGA-CV-7425-11A-01R-2081-07", "TCGA-CV-7432-11A-01R-2132-07",
  "TCGA-CV-7434-11A-01R-2132-07", "TCGA-CV-7437-11A-01R-2132-07", "TCGA-CV-7438-11A-01R-2132-07", "TCGA-CV-7440-11A-01R-2187-07", "TCGA-CV-7568-01A-11R-2232-07",
  "TCGA-CV-A45R-01A-11R-A24H-07", "TCGA-CX-7086-01A-11R-2081-07", "TCGA-D6-A6EO-01A-11R-A31N-07", "TCGA-F7-7848-01A-11R-2132-07", "TCGA-H7-A6C4-01A-11R-A30B-07",
  "TCGA-H7-A6C4-11A-21R-A466-07", "TCGA-H7-A6C5-11A-11R-A30B-07", "TCGA-HD-7917-01A-11R-2232-07", "TCGA-HD-8224-01A-11R-2403-07", "TCGA-HD-8635-11A-01R-2403-07",
  "TCGA-HD-A6HZ-11A-11R-A31N-07", "TCGA-HD-A6I0-11A-11R-A31N-07", "TCGA-IQ-7632-01A-11R-2081-07", "TCGA-P3-A5QF-01A-11R-A28V-07", "TCGA-P3-A6T8-01A-11R-A34R-07",
  "TCGA-TN-A7HJ-01A-12R-A34R-07", "TCGA-UF-A719-01A-12R-A34R-07", "TCGA-UF-A71D-01A-12R-A34R-07", "TCGA-WA-A7GZ-11A-11R-A34R-07"
)

# Junction table: the junction identifier column followed by the example samples.
all.junc <- juncs_file %>% dplyr::select(dplyr::all_of(c("Hybridization-REF", sample_barcodes)))
# Untouched copy of the selection; a later step maps the filtered junctions
# back onto it to recover the original rows.
junctions_with_strand_info <- all.junc
# Original sample barcodes (identifier column dropped); they are restored as
# column names when the trimmed junction file is written.
junc_original_sample_names <- colnames(junctions_with_strand_info)[-1]

# RSEM table: the same example samples, plus an untouched copy for the later
# mapping of Entrez ids back to the original rows.
all.RSEM <- RSEM_file %>% dplyr::select(dplyr::all_of(sample_barcodes))
rsem_with_all_genes <- all.RSEM

#6. To further trim the file, pre-filter the junctions to only include biologically relevant junctions. To do this we will source the OGSA scripts from OutSplice. Please edit the below code if your working directory is not the inst/script folder in the OutSplice package.
# setwd() returns the previous working directory. It is remembered here so the
# script goes back to wherever it started -- also when the path below has been
# edited or one of the source() calls fails -- rather than to a hard-coded
# relative location.
old_wd <- setwd('../../R')
# Absolute path of the R directory; it is also passed to dotheogsa() later on.
dir <- getwd()
print(dir)
invisible(tryCatch({
  source(file.path(dir, 'OGSAfunctionwFisher.R'))
  source(file.path(dir, 'outCallRank.R'))
}, finally = setwd(old_wd)))

# Shorten the raw count sample names to their first 15 characters and make
# sure they use '-' rather than '.'.
colnames(rawcounts) <- substr(colnames(rawcounts), 1, 15)
colnames(rawcounts) <- gsub('\\.', "-", colnames(rawcounts))

#remove header; rename row names and junctions
all.junc <- all.junc[-1, ] # remove header
all.RSEM <- all.RSEM[-1, ] # remove header

# Shortened (15 character) column names for the junction table.
all.samples <- colnames(all.junc)
all.samples <- substr(all.samples, 1, 15)
all.samples <- gsub('\\.', '-', all.samples)
junction.names <- all.junc[, 1]
#remove duplicates
all.junc <- all.junc[!duplicated(junction.names), ]
junction.names <- all.junc[, 1]
# Split each junction name on ':' once and rebuild it as field1:field2-field4.
# vapply() needs FUN.VALUE (it has no default); every extracted field is a
# single string, hence character(1).
junction.fields <- strsplit(junction.names, split = ":")
j <- paste0(vapply(junction.fields, function(x) x[[1]], character(1)), ":",
            vapply(junction.fields, function(x) x[[2]], character(1)), "-",
            vapply(junction.fields, function(x) x[[4]], character(1)))
rm(junction.fields)

rownames(all.junc) <- j
colnames(all.junc) <- all.samples
#remove first column
all.junc <- all.junc[, -1]

# Same 15 character sample names for the expression table.
expression.samples <- substr(colnames(all.RSEM), 1, 15)
expression.samples <- gsub('\\.', '-', expression.samples)
colnames(all.RSEM) <- expression.samples

#infer phenotype from sample names
# Only samples present in the junction, expression and raw count tables are used.
all.samples <- Reduce(
  intersect,
  list(colnames(all.junc), colnames(all.RSEM), colnames(rawcounts))
)
# Characters 14-15 of the sample name are used as the phenotype code.
pheno <- setNames(substr(all.samples, 14, 15), all.samples)
# include only primary tumor 01 or normal 11
pheno <- pheno[pheno == '01' | pheno == "11"]
# Only the two codes remain at this point, so anything that is not '01' is '11'.
pheno[] <- ifelse(pheno == '01', "Tumor", "Normal")
print(table(pheno))

if (sum(pheno == "Normal") < 10) {
  stop('Too few normal samples')
}
#subset only primary tumor 01 or normal 11
all.samples <- names(pheno)
rawcounts <- rawcounts[, all.samples]
all.RSEM <- all.RSEM[, all.samples]
all.junc <- all.junc[, all.samples]
pheno <- pheno[all.samples]

##change from char to numeric
# Convert every column with as.numeric() and collect the results in a matrix
# (one column per sample). vapply() needs FUN.VALUE (it has no default): each
# column converts to a numeric vector with one value per row.
n <- vapply(all.RSEM, as.numeric, numeric(nrow(all.RSEM)))
rownames(n) <- rownames(all.RSEM)
all.RSEM <- n
n <- vapply(all.junc, as.numeric, numeric(nrow(all.junc)))
rownames(n) <- rownames(all.junc)
all.junc <- n
remove(n)

## get data in RPM
print("convert to RPM")
totalrawcount <- colSums(rawcounts)
names(totalrawcount) <- colnames(rawcounts)
# Divide every sample column by that sample's total raw count, then scale by
# one million; rows stay junctions and columns stay samples.
junc.RPM <- sweep(all.junc[, all.samples], 2, totalrawcount[all.samples], "/") * 1000000

print("filter the putative junctions")
### filter genes by cut off of
fcCutoff <- 10
Cutoff.ratio <- 1 - (1 / fcCutoff)
# enforce overall Fold change
# A junction passes when (max - min) / max across samples exceeds the cutoff
# ratio; which() below also discards junctions where that ratio is NA/NaN.
logFC <- apply(junc.RPM, 1, function(x) {
  hi <- max(x)
  (hi - min(x)) / hi
}) > Cutoff.ratio
junc.RPM <- junc.RPM[which(logFC), ]

rm(logFC)
gc()

###### NEW FILTER: See if tumors have any outliers ###########################
##PHENO should have 'Normal' or 'Tumor' calls where Tumor ==1, Normal ==0, and names of each sample associated
PHENO <- setNames(as.numeric(pheno == 'Tumor'), names(pheno))

print("run the ogsa function for pre filtering")
## get function
# dotheogsa() comes from the OGSA scripts sourced earlier in this script.
test2 <- dotheogsa(Sample.data = junc.RPM, PHENO = PHENO, offsets = 0.1, dir = dir)
# Keep junctions whose value in either outRankTumor column is greater than 1.
has.outliers <- test2[, "outRankTumor1"] > 1 | test2[, "outRankTumor2"] > 1
junc.RPM <- junc.RPM[has.outliers, ]

### Filter all genes on the X and Y chromosomes

# Select, by name, the rows whose row name does not contain chrX or chrY.
junc.RPM <- junc.RPM[grep('chr[XY]', row.names(junc.RPM), invert = TRUE, value = TRUE), ]

#7. Map Junctions in junc.RPM back to the original file to choose which junctions to keep
#remove header; rename row names and junctions
junctions_with_strand_info <- junctions_with_strand_info[-1, ] # remove header

# Shortened (15 character) column names, with '.' replaced by '-'.
all.samples <- colnames(junctions_with_strand_info)
all.samples <- substr(all.samples, 1, 15)
all.samples <- gsub('\\.', '-', all.samples)
junction.names <- junctions_with_strand_info[, 1]
#remove duplicates
junctions_with_strand_info <- junctions_with_strand_info[!duplicated(junction.names), ]
junction.names <- junctions_with_strand_info[, 1]
# Split each junction name on ':' once and rebuild it as field1:field2-field4,
# the same form as the row names of junc.RPM. vapply() needs FUN.VALUE (it has
# no default); every extracted field is a single string, hence character(1).
junction.fields <- strsplit(junction.names, split = ":")
j <- paste0(vapply(junction.fields, function(x) x[[1]], character(1)), ":",
            vapply(junction.fields, function(x) x[[2]], character(1)), "-",
            vapply(junction.fields, function(x) x[[4]], character(1)))
rm(junction.fields)

rownames(junctions_with_strand_info) <- j
colnames(junctions_with_strand_info) <- all.samples

#Load List of Junctions that passed a pre-filter and merge
# Record the original row order so it can be restored after the merge.
junctions_with_strand_info$id <- seq_len(nrow(junctions_with_strand_info))

# Keep only the junctions whose row name is present in both tables.
final_junctions <- merge(junc.RPM, junctions_with_strand_info, by = 'row.names')

final_junctions <- final_junctions[order(final_junctions$id), ]

# The merged table starts with the Row.names key followed by the junc.RPM
# columns. Drop those so that only the columns that came from
# junctions_with_strand_info remain (95 columns for the 94 example samples).
final_junctions <- final_junctions[, -seq_len(ncol(junc.RPM) + 1)]

# The first remaining column holds the original junction names.
rownames(final_junctions) <- final_junctions[, 1]
final_junctions <- final_junctions[, -1]
# Drop the helper ordering column, leaving only the sample columns.
final_junctions <- final_junctions[, colnames(final_junctions) != "id"]
# Strip the literal ".y" suffix merge() appended to the duplicated sample
# names; the dot is escaped and anchored so nothing else can match.
colnames(final_junctions) <- sub("\\.y$", "", colnames(final_junctions))

#8. Append a TCGA Header Row to Retain Format
# One 'raw_counts' entry per sample column, placed above the junction rows.
my_header <- rep('raw_counts', times = 94)
final_junctions <- rbind(my_header, final_junctions)
# The prepended row is expected to carry the row name '1'; label it "junction".
row.names(final_junctions)[row.names(final_junctions) == '1'] <- "junction"
# Put the original full-length sample barcodes back as column names.
colnames(final_junctions) <- junc_original_sample_names

write.table(final_junctions, 'TCGA_HNSC_junctions.txt', quote = FALSE, sep = '\t', col.names = NA)

#9. Manually add a row header named "Hybridization REF" to the first column

#10. Use Genomic Ranges to find which genes to retain in the RSEM file. If a gene maps to multiple entrez ids, just the first id is retained
# create GenomicRanges object for junctions
# dplyr is detached before the annotation packages are attached -- presumably
# so that the select() call used for the annotation lookup further down is not
# masked by dplyr's select(); confirm before changing the order.
detach("package:dplyr")
library('TxDb.Hsapiens.UCSC.hg19.knownGene')
library('org.Hs.eg.db')
# Row names of junc.RPM have the form <chr>:<start>-<end>. vapply() needs
# FUN.VALUE (it has no default); every extracted field is a single string.
chr <- vapply(strsplit(row.names(junc.RPM), split = ":"),
              function(x) x[[1]], character(1))
junction.coords <- strsplit(row.names(junc.RPM), split = "[:-]")
start <- as.numeric(vapply(junction.coords, function(x) x[[2]], character(1)))
end <- as.numeric(vapply(junction.coords, function(x) x[[3]], character(1)))
rm(junction.coords)


# One range per junction, named by the junction row name.
geneAnnot <- GRanges(seqnames=Rle(chr),
                     IRanges(start=start, end=end))
names(geneAnnot) <- row.names(junc.RPM)

# Second object built from the same ranges and names.
geneAnnotAll <- GRanges(seqnames=Rle(chr),
                        IRanges(start=start, end=end))
names(geneAnnotAll) <- row.names(junc.RPM)

# get GenomicRanges object with genes for whole genome
gn <- genes(TxDb.Hsapiens.UCSC.hg19.knownGene)

# Look up the gene symbol for every gene id in gn and store both the symbol
# and the Entrez id as extra columns on gn.
symbol_map <- select(org.Hs.eg.db,
                     keys = as.character(gn$gene_id),
                     keytype = 'ENTREZID',
                     columns = 'SYMBOL')
gn$ENTREZID <- symbol_map$ENTREZID
gn$SYMBOL <- symbol_map$SYMBOL
# Remove the lookup table and the temporary coordinate vectors.
rm(symbol_map, chr, end, start)
gc()

# find symbols and ENTREZID for our junctions
overlap <- findOverlaps(geneAnnot,gn)

# For every junction with at least one overlapping gene, join the symbols of
# all overlapping genes with ';'.
geneSYMBOLS <- tapply(gn$SYMBOL[subjectHits(overlap)],
                      queryHits(overlap),paste,collapse=';')

# add to genome ranges object
# Initialised with a character NA so the column is character even when no
# junction overlaps a gene.
geneAnnot$SYMBOL <- NA_character_
geneAnnot$SYMBOL[as.numeric(names(geneSYMBOLS))] <- geneSYMBOLS

geneENTREZID <- tapply(gn$ENTREZID[subjectHits(overlap)],
                       queryHits(overlap),paste,collapse=';')

# Character NA for the same reason; strsplit() below rejects a logical vector.
geneAnnot$ENTREZID <- NA_character_
geneAnnot$ENTREZID[as.numeric(names(geneENTREZID))] <- geneENTREZID

# Keep only the first Entrez id of each junction. vapply() needs FUN.VALUE (it
# has no default); the first element is always a single string (or NA).
entrez_ids_to_keep <- vapply(strsplit(geneAnnot$ENTREZID, ";"),
                             function(x) x[1], character(1))
entrez_ids_to_keep <- unique(entrez_ids_to_keep)
entrez_ids_to_keep <- as.data.frame(entrez_ids_to_keep)
entrez_ids_to_keep <- na.omit(entrez_ids_to_keep)
# Row names are the ids themselves so the table can be merged by row name.
rownames(entrez_ids_to_keep) <- entrez_ids_to_keep$entrez_ids_to_keep

#11. Map Entrez Ids to original RSEM file
# Keep the original row names in a column so they survive the merge below.
rsem_with_all_genes$Hybridization_REF <- rownames(rsem_with_all_genes)
rsem_with_all_genes <- rsem_with_all_genes[-1, ]
# Use the part after '|' of each row name as the new row name (the merge key).
# vapply() needs FUN.VALUE (it has no default); the extracted element is a
# single string.
rownames(rsem_with_all_genes) <- vapply(strsplit(row.names(rsem_with_all_genes), split ='\\|'),
                                        function(x) x[2], character(1))

final_rsem <- merge(entrez_ids_to_keep, rsem_with_all_genes, by = 'row.names')
rownames(final_rsem) <- final_rsem$Hybridization_REF
# Drop the merge key, the id column and the saved row-name column by name, so
# the result does not depend on there being exactly 94 sample columns.
helper_cols <- c("Row.names", "entrez_ids_to_keep", "Hybridization_REF")
final_rsem <- final_rsem[, !(colnames(final_rsem) %in% helper_cols)]
rm(helper_cols)
final_rsem <- final_rsem[order(row.names(final_rsem)), ]

# One 'normalized_count' entry per sample column, placed above the gene rows.
my_rsem_header <- rep('normalized_count', times = ncol(final_rsem))
final_rsem <- rbind(my_rsem_header, final_rsem)
# The prepended row is expected to carry the row name '1'; label it "gene_id".
rownames(final_rsem)[rownames(final_rsem) == '1'] <- "gene_id"

write.table(final_rsem, file = 'TCGA_HNSC_genes_normalized.txt', sep = '\t', quote = FALSE, col.names = NA)

#11. Manually add a row header named "Hybridization REF" to the first column

#12. Create rawcounts file that contains the total number rawcounts by summing the rawcounts for each gene in each sample in the full_HNSC_raw_counts_unc.txt file.
# Single-row table: one column per sample, with the row labelled 'total'.
final_rawcounts <- matrix(
  colSums(rawcounts),
  nrow = 1,
  dimnames = list('total', colnames(rawcounts))
)
write.table(final_rawcounts, 'Total_Rawcounts.txt', quote = FALSE, sep = '\t', col.names = NA)

#13. Manually add a row header named "HYBRIDIZATION R" to the first column

#14. Create Junctions file for the regular OutSplice function by removing the header column and strand information from the junctions. Sample names here must match those in the rawcounts file.
all.junc <- read.table(file = "TCGA_HNSC_junctions.txt", sep = '\t', header = TRUE, stringsAsFactors = FALSE)

#remove header; rename row names and junctions
all.junc <- all.junc[-1, ]

# Shortened (15 character) column names, with '.' replaced by '-'.
all.samples <- colnames(all.junc)
all.samples <- substr(all.samples, 1, 15)
all.samples <- gsub('\\.', '-', all.samples)
junction.names <- all.junc[, 1]
#remove duplicates
all.junc <- all.junc[!duplicated(junction.names), ]

junction.names <- all.junc[, 1]
# Split each junction name on ':' once and rebuild it as field1:field2-field4.
# vapply() needs FUN.VALUE (it has no default); every extracted field is a
# single string, hence character(1).
junction.fields <- strsplit(junction.names, split = ":")
j <- paste0(vapply(junction.fields, function(x) x[[1]], character(1)), ":",
            vapply(junction.fields, function(x) x[[2]], character(1)), "-",
            vapply(junction.fields, function(x) x[[4]], character(1)))
rm(junction.fields)

rownames(all.junc) <- j
colnames(all.junc) <- all.samples

# Drop the junction name column now that the names are the row names.
all.junc <- all.junc[, -1]

write.table(all.junc, file = 'HNSC_junctions.txt', sep = '\t', quote = FALSE, col.names = NA)

#15. Create normalized genes file for the regular OutSplice function by removing the header column. Sample names here must match those in the rawcounts file. Gene names must also be removed.
all.RSEM <- read.table(file = 'TCGA_HNSC_genes_normalized.txt', header = TRUE, row.names = 1, sep = "\t")

# Drop the first row of the table that was read.
all.RSEM <- all.RSEM[-1, ]

# Shortened (15 character) sample names, with '.' replaced by '-'.
expression.samples <- substr(colnames(all.RSEM), 1, 15)
expression.samples <- gsub('\\.', '-', expression.samples)
colnames(all.RSEM) <- expression.samples

# Keep only the part after '|' of each row name. vapply() needs FUN.VALUE (it
# has no default); the extracted element is a single string.
rownames(all.RSEM) <- vapply(strsplit(row.names(all.RSEM), split = '\\|'),
                             function(x) x[2], character(1))

write.table(all.RSEM, file = 'HNSC_genes_normalized.txt', sep = '\t', quote = FALSE, col.names = NA)

#16. Same as above, manually add Hybridization REF as a header to the first columns in both the junctions and genes normalized files.

#17. The file HNSC_pheno_table.txt file was created by taking the 50 Tumor and 44 Normal Samples listed in the rawcounts vector above and labeling them as T for Tumors and F for Normals. Tumor and Normals were identified based on the TCGA labelling code at the end of the sample names: "01" for Tumor, and "11" for Normal.
