#Illumina芯片分析
#导入原始数据
remove(list = ls())
setwd('H:/project')
library(GenomicFeatures)
library(methylumi)
library(lumi)#在安装lumi包的时候遇到一堆问题，methylumi是lumi的关联包，反正搞了好久也还是装上去了。。。。
## 首先是从illumina的芯片结果文件，自己用R的lumi包来获取表达矩阵。
# ---- Run configuration ----
# Dataset and platform identifiers; they are pasted into every input/output path below.
GEO_id <- 'GSE33877'
platform_id <- 'GPL6947'
# Every platform except GPL6884 uses the official Illumina manifest for probe annotation;
# GPL6884 uses the GEO annotation file instead.
illumina_official_anno <- platform_id != 'GPL6884'
# tiss_type <- 'other_blood_cell'
tiss_type <- 'PBMC_from_Peripheral_blood'
# Sample ids to exclude; overwritten by the quality-control step when it runs.
outliers <- NA
useGEO_file <- FALSE
doQualitycontrol <- TRUE
# TRUE: build pdata from the series matrix; FALSE: read a previously saved pdata file.
first_time_for_pdata <- FALSE
platform_seperate_relationship_file <- TRUE
platform_seperate_pdata <- FALSE
# When TRUE, `extra_information` is appended to the raw-data and filter-report file names.
use_extra <- FALSE
extra_information <- 'CD8'
# Minimum fraction of samples in which a probe must be detected (p < 0.01) to be kept.
threshold <- 0.25
# TRUE: apply the detection threshold within each sample group separately.
seperate_group <- TRUE
# Load the pre-processed, non-normalized data table.
# Input files come in three flavours: (1) lumi-readable with GSM column names,
# (2) raw files that already provide GSM column names, (3) raw files without GSM
# column names that can still be read.
fileName <- if (use_extra == FALSE) {
  paste0('results/lumi_accessible_raw_data/', GEO_id, '_', platform_id, '_lumi_accessible_raw.txt')
} else {
  paste0('results/lumi_accessible_raw_data/', GEO_id, '_', platform_id, '_lumi_accessible_raw_', extra_information, '.txt')
}

file.raw <- read.table(fileName, sep = '\t', header = TRUE, fill = TRUE, quote = "")
# Any column whose name contains "ID" is renamed to PROBE_ID.
is_id_column <- grepl(pattern = 'ID|ID_REF', colnames(file.raw))
colnames(file.raw)[is_id_column] <- 'PROBE_ID'
# Report whether duplicated probe ids exist, then keep the first occurrence of each.
is_dup_probe <- duplicated(file.raw$PROBE_ID)
print(any(is_dup_probe))
file.raw <- file.raw[!is_dup_probe, ]
rownames(file.raw) <- file.raw$PROBE_ID
# Drop the first column (assumed to be the probe id column -- TODO confirm for new inputs).
file.raw <- file.raw[, -1]

# Read the raw data with lumi and run a first normalization pass on all samples.
setwd('H:/project')
x.lumi <- lumiR.batch(fileName, sep = '\t')
lumi.N.Q <- lumiExpresso(
  x.lumi,
  bg.correct = TRUE,
  variance.stabilize = TRUE,
  varianceStabilize.param = list('log2'),
  normalize = TRUE,
  QC.evaluation = FALSE
)
expr <- exprs(lumi.N.Q)


# ---- Quality control ----
# Runs the arrayQualityMetrics boxplot and heatmap modules on the raw lumi object,
# saves each module's outlier statistic to its own file, writes the QC report, and
# stores in `outliers` the samples flagged by BOTH modules.
if (doQualitycontrol == TRUE) {
  library(arrayQualityMetrics)
  data.qc <- prepdata(expressionset = x.lumi, intgroup = c(), do.logtransform = TRUE)

  # Boxplot module and its per-sample outlier statistic.
  bo <- aqm.boxplot(data.qc)
  bo_stat <- bo@outliers@statistic
  write.table(as.data.frame(bo_stat), file = paste0('results/', GEO_id, '_', platform_id, '_qc_boxplot_Ka_values.txt'), sep = '\t')

  # Heatmap (between-array distance) module and its per-sample outlier statistic.
  hp <- aqm.heatmap(data.qc)
  hp_stat <- hp@outliers@statistic
  # Fixed: this file used to receive the boxplot statistic (bo_stat) a second time.
  write.table(as.data.frame(hp_stat), file = paste0('results/', GEO_id, '_', platform_id, '_qc_distance_pheatmap_Sa_values.txt'), sep = '\t')

  aqm.writereport(modules = list(bo, hp), reporttitle = paste0('arrayQualityMetrics report for ', GEO_id, '_', platform_id), outdir = paste0('results/', GEO_id, '_', platform_id, '_QCreport'), arrayTable = data.qc$pData)

  # Collect the outlier samples of each module.
  bo_outliers <- names(bo@outliers@which)
  # Fixed: heatmap outliers were read from `bo`, which made the intersection below
  # a no-op and removed every boxplot outlier.
  hp_outliers <- names(hp@outliers@which)
  # ma_outliers=names(bo@outliers@which)
  outliers <- intersect(bo_outliers, hp_outliers)
}

# Write a temporary raw file without the outlier samples, re-read it with lumi for
# the downstream analysis, then delete the temporary file.
tmp_raw_path <- paste0('results/lumi_accessible_raw_data/', 'tmp.txt')

# Sample id of every column = the part of the column name before the first ".".
sub1 <- vapply(
  colnames(file.raw),
  function(col_name) strsplit(col_name, '\\.')[[1]][1],
  character(1)
)
is_outlier_column <- sub1 %in% outliers
tmp.output <- file.raw[, !is_outlier_column] # drop the outlier samples
ID_REF <- rownames(tmp.output)
tmp.output <- cbind(ID_REF, tmp.output)
# library(dplyr)
# tmp.output=tmp.output %>% mutate(ID_REF=ID_REF) %>% select(ID_REF,everything())
write.table(tmp.output, file = tmp_raw_path, sep = '\t', quote = FALSE, row.names = FALSE)
setwd('H:/project')
remove(x.lumi)
x.lumi <- lumiR.batch(tmp_raw_path, sep = '\t')


file.remove(tmp_raw_path)

#----------------------------------------------------------
# Rebuild file.raw from the outlier-free table and index it by probe id again.
file.raw <- tmp.output
id_column_hit <- grepl(pattern = 'ID|ID_REF', colnames(file.raw))
colnames(file.raw)[id_column_hit] <- 'PROBE_ID'
probe_is_duplicate <- duplicated(file.raw$PROBE_ID)
print(any(probe_is_duplicate))
file.raw <- file.raw[!probe_is_duplicate, ]
rownames(file.raw) <- file.raw$PROBE_ID
file.raw <- file.raw[, -1]

# ---- Probe filtering: collect detection p-values ----
# Probes are screened on the detection p-values shipped with the raw data.
# Check which software version produced them: a boxplot gives a quick answer. If most
# probes have small p-values, smaller is better (newer software); otherwise the file
# comes from older software and all p-values must be transformed (see 1 - p below).
detection_columns <- grep('Detection', colnames(file.raw))
probe_list <- file.raw[, detection_columns]
#probe_list=file.raw[,which(grepl('pvalue',colnames(file.raw)))]
probe_number_all <- nrow(probe_list)

# Coerce every detection column to numeric; the result is a numeric matrix.
probe_list <- as.data.frame(probe_list)
probe_list <- sapply(probe_list, as.numeric)
rownames(probe_list) <- rownames(file.raw)
boxplot(probe_list, names = FALSE)
# probe_list=1-probe_list
per_sample_median <- apply(probe_list, 2, median)
median_rawdata <- median(per_sample_median)
# ---- Probe filtering on detection p-values ----
# A probe counts as detected in a sample when its detection p-value is < 0.01.
#   seperate_group = TRUE : a probe is discarded only when, in EVERY sample group, it is
#                           detected in fewer than `threshold` (fraction) of the samples.
#   seperate_group = FALSE: a probe is kept when it is detected in at least `threshold`
#                           (fraction) of all samples.
# Both branches produce probe_keep, probe_filt and the one-row summary `filter_report`.
if(seperate_group){
  probe_id_list=rownames(probe_list)
  if(first_time_for_pdata==T){
    # Build pdata from the GSE series matrix with GEOquery.


    # Data processing: 1. after obtaining the matrix, correct the overall expression level
    library(GEOquery)
    library(stringr)
    library(dplyr)
    # library(tidyverse)
    # Only needed on a slow connection; unnecessary when the file is available locally
    # library(GEOmirror)
    # eSet=geoChina(GEO_id)
    # If the regular download works, the mirror is not needed
    # Load the series matrix file
    gset=getGEO(filename = paste0('H:/HIV_microarray_data_GEO/',tiss_type,'/',platform_id,'_illumina/',GEO_id,'_series_matrix.txt.gz'),destdir = '.',GSEMatrix = T,AnnotGPL = F,getGPL = F)
    # Inspect the grouping information available in pdata
    pdata=pData(gset[1])

    table(pdata$source_name_ch1)
    # If the series matrix carries no grouping information, groups must be set by hand
    # Manual grouping: not recommended, use only when the grouping is unclear
    # Read the grouping file
    group_table=read.table(paste0('data/',GEO_id,'_group.txt'),sep = '\t',header = T)
    pdata$sample_id=rownames(pdata)
    pdata=merge(pdata,group_table,by='sample_id')
    write.table(pdata,file = paste0('results/',GEO_id,'_pdata.txt'),sep = '\t',quote = F)
    sample2id=pdata[,which(colnames(pdata) %in% c('geo_accession','group'))]
    # Assign group names from the grouping column
    pdata=pdata[pdata$source_name_ch1=='PBMC samples at week 0',]
    pdata$group='HIV_ART'
    sample2id=pdata[,which(colnames(pdata) %in% c('geo_accession','group'))]
    write.table(pdata,file = paste0('results/',GEO_id,'_pdata.txt'),sep = '\t',quote = F)
    # Force the group of every sample
    pdata=pdata %>%
      # filter(column!='c') %>%
      #mutate(group=ifelse(title=='Patient 1','HIV_nonART',ifelse(title=='Patient 2','HIV_ART',ifelse(title=='Patient 3','HIV_ART',NA))))
      mutate(group=ifelse(grepl('Control',source_name_ch1),'HIVfree',NA))
    setwd('H:/project')
    write.table(pdata,file = paste0('results/',GEO_id,'_pdata.txt'),sep = '\t',quote = F)
    sample2id=pdata[,which(colnames(pdata) %in% c('geo_accession','group'))]
    # write.table(exp1,file = paste0('results/',GEO_id,'_expression_matrix_all_group.txt'),sep = '\t',quote = F)
  }else{
    # Read a previously saved pdata file. It is read without a header so that the header
    # row can be repaired: when the last field of row 1 is empty, the header is shifted
    # right by one cell and the first column becomes the row names.
    if(platform_seperate_pdata==T){
      pdata=read.table(paste0('H:/project/results/',tiss_type,'/illumina/matrix/',GEO_id,'_',platform_id,'_pdata.txt'),sep = '\t',fill = T,header = F,row.names = NULL)
      if(pdata[1,ncol(pdata)]==''){
        pdata[1,]=c('',pdata[1,-length(pdata[1,])])
        colnames(pdata)=pdata[1,]
        pdata=pdata[-1,]
        rownames(pdata)=pdata[,1]
        pdata=pdata[,-1]
      }else if(ncol(pdata)==2){
        colnames(pdata)=pdata[1,]
        pdata=pdata[-1,]
      }else{
        stop('Check pdata file, fix it manually if the problem is too complicated!')
      }
    }else{
      pdata=read.table(paste0('H:/project/results/',tiss_type,'/illumina/matrix/',GEO_id,'_pdata.txt'),sep = '\t',fill=T,header = F)
      # pdata=read.table(paste0('H:/project/results/',tiss_type,'/illumina/matrix/',GEO_id,'_',extra_information,'_pdata.txt'),sep = '\t',fill = T,header = F,row.names = NULL)
      if(pdata[1,ncol(pdata)]==''){
        pdata[1,]=c('',pdata[1,-length(pdata[1,])])
        colnames(pdata)=pdata[1,]
        pdata=pdata[-1,]
        rownames(pdata)=pdata[,1]
        pdata=pdata[,-1]
      }else if(ncol(pdata)==2){
        colnames(pdata)=pdata[1,]
        pdata=pdata[-1,]
      }else{
        stop('Check pdata file, fix it manually if the problem is too complicated!')
      }
    }
    sample2id=pdata[,which(colnames(pdata) %in% c('geo_accession','group'))]
  }
  library(stringr)
  # Reduce every detection column name to its GSM accession.
  colnames(probe_list)=sapply(colnames(probe_list),function(x){
    if(x %in% c('gene_symbol','ENSEMBL_id')){
      return(x)
    } else{
      return(str_extract(x,'GSM\\d*'))
    }
  })
  # Lookup accession -> group, then relabel every column with its group name.
  # Samples without a group end up in a group literally named "NA" (paste0 of NA).
  dict0=setNames(sample2id$group,sample2id$geo_accession)
  # test1=paste0(dict0[colnames(probe_list)])
  colnames(probe_list)=paste0(dict0[colnames(probe_list)])
  group_tmp=unique(colnames(probe_list))
  sample_tmp=c()
  for(single_group_tmp in group_tmp){
    # drop = FALSE keeps a one-sample group as a one-column matrix; without it the
    # subset collapses to a vector and rowSums()/ncol() below fail.
    probe_list_sub=probe_list[,colnames(probe_list)%in%single_group_tmp,drop=FALSE]
    # Probes that miss the detection threshold within this group.
    probe_remove_sub=rownames(probe_list_sub)[which(!rowSums(probe_list_sub<0.01)>=(threshold*ncol(probe_list_sub)))]
    sample_tmp=c(sample_tmp,probe_remove_sub)
  }
  # Count in how many groups each probe failed; discard those that failed in all groups.
  sample_tmp=as.data.frame(table(sample_tmp))
  sample_tmp$sample_tmp=as.character(sample_tmp$sample_tmp)
  probe_filt=sample_tmp[which(sample_tmp$Freq==length(group_tmp)),][[1]]
  probe_keep=setdiff(probe_id_list,probe_filt)
  probe_filt_number=length(probe_filt)
  probe_keep_number=length(probe_keep)
  proportion_filter=probe_filt_number/probe_number_all

  filter_report=data.frame(number_of_all_probes=probe_number_all,number_of_probe_keep=probe_keep_number,number_of_probe_filt=probe_filt_number,proportion_filter=proportion_filter)
  rownames(filter_report)=GEO_id
}else{
  # Probes detected in at least `threshold` of all samples (the threshold is adjustable).
  probe_keep=rownames(probe_list)[which(rowSums(probe_list<0.01)>=(threshold*ncol(probe_list)))]

  probe_keep_number=length(probe_keep)
  # Probes that were filtered out.
  probe_filt=rownames(probe_list)[which(!rowSums(probe_list<0.01)>=(threshold*ncol(probe_list)))]

  probe_filt_number=length(probe_filt)

  proportion_filter=probe_filt_number/probe_number_all

  filter_report=data.frame(number_of_all_probes=probe_number_all,number_of_probe_keep=probe_keep_number,number_of_probe_filt=probe_filt_number,proportion_filter=proportion_filter)
  rownames(filter_report)=GEO_id
}

# Save the probe-filter summary; the file name carries `extra_information` when use_extra is set.
filter_report_file <- if (use_extra == TRUE) {
  paste0('results/', GEO_id, '_', platform_id, '_', extra_information, '_filter_report.txt')
} else {
  paste0('results/', GEO_id, '_', platform_id, '_filter_report.txt')
}
write.table(filter_report, file = filter_report_file, sep = '\t', row.names = TRUE, quote = FALSE)
#----------------------------------------------------------








# ---- Sample metadata (pdata) and group assignment ----
# Produces `pdata` and `sample2id` (the geo_accession / group columns of pdata), either
# from the GSE series matrix (first run) or from a previously saved pdata file.
if(first_time_for_pdata==T){
  # Build pdata from the GSE series matrix with GEOquery.


  # Data processing: 1. after obtaining the matrix, correct the overall expression level
  library(GEOquery)
  library(stringr)
  # Fixed: %>% and mutate() are used below, but dplyr was not attached on this path
  # unless the probe-filtering step happened to load it.
  library(dplyr)
  # library(tidyverse)
  # Only needed on a slow connection; unnecessary when the file is available locally
  # library(GEOmirror)
  # eSet=geoChina(GEO_id)
  # If the regular download works, the mirror is not needed
  # Load the series matrix file.
  # Fixed: the directory suffix was '_affymetrix', while the probe-filtering step reads
  # the same series matrix of this Illumina platform from '<platform>_illumina/'.
  gset=getGEO(filename = paste0('H:/HIV_microarray_data_GEO/',tiss_type,'/',platform_id,'_illumina/',GEO_id,'_series_matrix.txt.gz'),destdir = '.',GSEMatrix = T,AnnotGPL = F,getGPL = F)
  # Inspect the grouping information available in pdata
  pdata=pData(gset[1])

  table(pdata$source_name_ch1)
  # If the series matrix carries no grouping information, groups must be set by hand
  # Manual grouping: not recommended, use only when the grouping is unclear
  # Read the grouping file
  group_table=read.table(paste0('data/',GEO_id,'_group.txt'),sep = '\t',header = T)
  pdata$sample_id=rownames(pdata)
  pdata=merge(pdata,group_table,by='sample_id')
  write.table(pdata,file = paste0('results/',GEO_id,'_pdata.txt'),sep = '\t',quote = F)
  sample2id=pdata[,which(colnames(pdata) %in% c('geo_accession','group'))]
  # Assign group names from the grouping column
  pdata=pdata[pdata$source_name_ch1=='PBMC samples at week 0',]
  pdata$group='HIV_ART'
  sample2id=pdata[,which(colnames(pdata) %in% c('geo_accession','group'))]
  write.table(pdata,file = paste0('results/',GEO_id,'_pdata.txt'),sep = '\t',quote = F)
  # Force the group of every sample.
  # NOTE(review): this rule (CP / EC in `title`) differs from the rule used for the
  # probe-filtering step ('Control' in source_name_ch1) -- confirm which one is intended.
  pdata=pdata %>%
    # filter(column!='c') %>%
    #mutate(group=ifelse(title=='Patient 1','HIV_nonART',ifelse(title=='Patient 2','HIV_ART',ifelse(title=='Patient 3','HIV_ART',NA))))
    mutate(group=ifelse(grepl('CP',title),'HIV_nonART',ifelse(grepl('EC',title) ,'HIV_EC',NA)))
  setwd('H:/project')
  write.table(pdata,file = paste0('results/',GEO_id,'_pdata.txt'),sep = '\t',quote = F)
  sample2id=pdata[,which(colnames(pdata) %in% c('geo_accession','group'))]
  # Fixed: a write.table(exp1, ...) call used to sit here, but exp1 is only created
  # later in the script, so the first-run path stopped with "object 'exp1' not found".
  # The expression matrix is written once it has been built.
}else{
  # Read a previously saved pdata file. It is read without a header so that the header
  # row can be repaired: when the last field of row 1 is empty, the header is shifted
  # right by one cell and the first column becomes the row names.
  if(platform_seperate_pdata==T){
    pdata=read.table(paste0('H:/project/results/',tiss_type,'/illumina/matrix/',GEO_id,'_',platform_id,'_pdata.txt'),sep = '\t',fill = T,header = F,row.names = NULL)
    if(pdata[1,ncol(pdata)]==''){
      pdata[1,]=c('',pdata[1,-length(pdata[1,])])
      colnames(pdata)=pdata[1,]
      pdata=pdata[-1,]
      rownames(pdata)=pdata[,1]
      pdata=pdata[,-1]
    }else if(ncol(pdata)==2){
      colnames(pdata)=pdata[1,]
      pdata=pdata[-1,]
    }else{
      stop('Check pdata file, fix it manually if the problem is too complicated!')
    }
  }else{
    pdata=read.table(paste0('H:/project/results/',tiss_type,'/illumina/matrix/',GEO_id,'_pdata.txt'),sep = '\t',fill=T,header = F,row.names = NULL)
    # pdata=read.table(paste0('H:/project/results/',tiss_type,'/illumina/matrix/',GEO_id,'_',extra_information,'_pdata.txt'),sep = '\t',fill = T,header = F,row.names = NULL)
    if(pdata[1,ncol(pdata)]==''){
      pdata[1,]=c('',pdata[1,-length(pdata[1,])])
      colnames(pdata)=pdata[1,]
      pdata=pdata[-1,]
      rownames(pdata)=pdata[,1]
      pdata=pdata[,-1]
    }else if(ncol(pdata)==2){
      colnames(pdata)=pdata[1,]
      pdata=pdata[-1,]
    }else{
      stop('Check pdata file, fix it manually if the problem is too complicated!')
    }
  }
  sample2id=pdata[,which(colnames(pdata) %in% c('geo_accession','group'))]
}



# ---- Normalization and probe subsetting ----
# Check that the sample annotation read by lumi is consistent before normalizing: every
# row name of the phenotype table must equal the value in its first column.
# NOTE(review): the first column is presumably lumi's sample id -- confirm.
a <- pData(phenoData(x.lumi))
# Fixed: this used any(), which let the check pass as soon as a single sample matched,
# although the error below is meant to fire when not ALL samples are paired.
Is_Essential <- all(rownames(a) == a[rownames(a), 1])
if (isTRUE(Is_Essential)) {
  # Background correction, log2 variance stabilization and normalization of the raw data.
  lumi.N.Q <- lumiExpresso(x.lumi, bg.correct = TRUE, variance.stabilize = TRUE, varianceStabilize.param = list('log2'), normalize = TRUE, QC.evaluation = FALSE)
} else {
  stop('Not all pval is paired with intensity!')
}

data_expr <- exprs(lumi.N.Q)
dim(data_expr)

# Keep only the probes that passed the detection filter.
data_expr <- data_expr[rownames(data_expr) %in% probe_keep, ]
dim(data_expr)

# ---- Probe annotation ----
# Builds `ids_official`, a two-column table (probe_id, symbol) restricted to probes that
# have a non-empty symbol, either from the official Illumina manifest or from the GEO
# platform annotation file.
# anno_official=read.table(file = 'data/annotation_file/illumina/HumanHT-12_V4_0_R2_15002873_B.txt',sep = '\t',quote = "",header = T,fill = T)
if(illumina_official_anno){
  # Official manifest file for every supported platform.
  manifest_by_platform=c(
    GPL6947='data/annotation_file/illumina/HumanHT-12_V3_0_R3_11283641_A.txt',
    GPL10558='data/annotation_file/illumina/HumanHT-12_V4_0_R2_15002873_B.txt'
  )
  # Fail with a clear message instead of the "object 'anno_official' not found" error
  # that an unlisted platform used to trigger further down.
  if(!platform_id %in% names(manifest_by_platform)){
    stop('No official Illumina annotation file is configured for platform ',platform_id)
  }
  anno_official=read.table(file = manifest_by_platform[[platform_id]],sep = '\t',quote = "",header = T,fill = T)
  ids_official=anno_official[,c('Probe_Id','Symbol')]
  # ids_official=anno_official[,c('Array_Address_Id','Symbol')]
  # Drop rows in which both the probe id and the symbol are empty.
  ids_official=ids_official[!apply(ids_official == "", 1, all),]
  colnames(ids_official)=c('probe_id','symbol')
  length(unique(ids_official$symbol[ids_official$symbol!=""]))
  # Keep only the probes that are annotated with a symbol.
  ids_official=ids_official[ids_official$symbol!="",]
}else{
  library(GEOquery)
  library(stringr)
  annoset = getGEO(filename = paste0('H:/project/data/annotation_file/illumina/',platform_id,'.annot.gz'),destdir = ".")
  b = annoset@dataTable@table
  colnames(b)
  # The symbol column is not always called "Gene symbol"; if this line fails, check
  # colnames(b) to find the annotation column.
  ids2 = b[,c("ID","Gene symbol")]
  colnames(ids2) = c("probe_id","symbol")
  # Probes that map to several symbols (separated by "///").
  probe_multi=nrow(ids2[str_detect(ids2$symbol,"///"),])
  propotion1=probe_multi/nrow(ids2)

  # Split multi-symbol entries into their parts; other entries stay as they are.
  split_names <- lapply(ids2$symbol, function(x) {
    if (grepl("///", x)) {
      return(strsplit(x, "///")[[1]])
    } else {
      return(x)
    }
  })

  # Repeat every probe row once per symbol so that each symbol gets its own row.
  df_new <- ids2[rep(seq_len(nrow(ids2)), sapply(split_names, length)), ]
  # Flatten the list of symbols into the new column, then drop the original one.
  df_new$symbol_new <- unlist(split_names)
  df_new=df_new[,-2]
  colnames(df_new)[2]='symbol_anno_GEO'
  ids_GEO=df_new
  ids_official=ids_GEO
  colnames(ids_official)=c('probe_id','symbol')
  length(unique(ids_official$symbol[ids_official$symbol!=""]))
  ids_official=ids_official[ids_official$symbol!="",]
}


# ---- Map annotation symbols to HUGO approved symbols ----
library(dplyr)
setwd('H:/project')
# Lookup table: query name -> approved symbol, with a `type` label per row.
HUGO_full_table <- read.table(file = 'data/NCBI_annotation_24_4_22.txt', sep = '\t', quote = "", header = TRUE, fill = TRUE)

# Names that are an alias/previous name of several genes and are not approved symbols
# themselves: remember them, then remove them from the lookup table.
HUGO_full_table.sub.specials <- HUGO_full_table[HUGO_full_table$type == 'Is alias/previous name of multi genes', ]
special.symbols <- unique(HUGO_full_table.sub.specials$all_possible_query_names)
HUGO_full_table.sub <- HUGO_full_table[!HUGO_full_table$type %in% c('Is alias/previous name of multi genes'), ]

# A name that is an approved symbol and also an alias of another gene is treated as the
# approved symbol only: drop the rows that map it to the other gene.
alias_of_other_gene <- HUGO_full_table.sub$type == 'Is approved_symbol & alias/previous name of other gene' &
  HUGO_full_table.sub$Approved_symbol != HUGO_full_table.sub$all_possible_query_names
HUGO_full_table.sub <- HUGO_full_table.sub[!alias_of_other_gene, ]
any(duplicated(HUGO_full_table.sub$all_possible_query_names)) # check the query names for duplicates

# Dictionary query name -> approved symbol, applied to every annotated symbol.
dict_HUGO <- setNames(HUGO_full_table.sub$Approved_symbol, HUGO_full_table.sub$all_possible_query_names)
ids_official$approved_symbol <- unname(dict_HUGO[ids_official$symbol])

# Ambiguous names (several possible genes) keep their original symbol.
is_special_symbol <- ids_official$symbol %in% special.symbols
ids_official$approved_symbol[is_special_symbol] <- ids_official$symbol[is_special_symbol]
# Symbols without an approved name also keep their original symbol.
has_no_approved <- is.na(ids_official$approved_symbol)
ids_official$approved_symbol[has_no_approved] <- ids_official$symbol[has_no_approved]

gene_ids <- ids_official[, c(1, 3)]
# gene_ids=gene_ids[!is.na(gene_ids$approved_symbol),]
names(gene_ids) <- c('probe_id', 'symbol')

#以下部分涉及GEO以及厂家注释文件的并集/交集部分，暂时用不上--------------------------------------------------------
# 两个注释文件取并集
# ids.merged=merge(ids_official,ids_GEO,by='probe_id',all=T) 
# ids.merged.1=ids.merged[!(is.na(ids.merged$symbol_official)&(ids.merged$symbol_anno_GEO=="")),]
# probe_avaliable=ids.merged.1$probe_id
# # write.table(ids.merged.1,file = 'results/2_21-probe_id_old_and_new_annotation.txt',sep = '\t',quote = F,row.names = F)
# 
# #分别导入两个注释文件中各个基因对应的官方现用名并与注释包中的进行对应,由于比对时存在多个探针对应一个基因的情况，返回的结果中存在相同的行，因此把原始symbol重复的行删掉
# uni_name_official=read.table(file = 'results/hgnc-symbol-check-illumina_official.txt',sep = '\t',quote = "",header = T)
# uni_name_GEO=read.table(file = 'results/hgnc-symbol-check-GEO_annotation.txt',sep = '\t',quote = "",header = T)
# #处理以下问题：某个symbol是某个基因的现用名以及另一个基因的别名 处理方法：如果该symbol的现用名是对的就删掉其他行
# duplicated_GEO=uni_name_GEO[duplicated(uni_name_GEO$gene_symbol)| duplicated(uni_name_GEO$gene_symbol,fromLast=T),]
# duplicated_official=uni_name_official[duplicated(uni_name_official$gene_symbol)| duplicated(uni_name_official$gene_symbol,fromLast=T),]
# duplicated_GEO=duplicated_GEO[which((duplicated_GEO$Match.type) %in%c('Alias symbol','Previous symbol')),]
# duplicated_GEO=duplicated_GEO[!duplicated(duplicated_GEO$gene_symbol),]
# duplicated_GEO=duplicated_GEO[duplicated(duplicated_GEO$gene_symbol)| duplicated(duplicated_GEO$gene_symbol,fromLast=T),]
# uni_name_official=uni_name_official[!duplicated(uni_name_official$gene_symbol),]
# uni_name_GEO=uni_name_GEO[!duplicated(uni_name_GEO$gene_symbol),]
# 
# #分别与注释包进行配对
# colnames(uni_name_official)[1]='symbol_official'
# colnames(uni_name_GEO)[1]='symbol_anno_GEO'
# ids_GEO.1=merge(ids_GEO[(ids_GEO$probe_id)%in%probe_avaliable,],uni_name_GEO[,c(1,3)],by='symbol_anno_GEO',all.x=T) #结果中有部分Approved_name为NA是因为用excel打开后部分输入的文件中的基因名被改了导致匹配不上，这种基因共56个（不排除多个探针对应同个基因的情况）
# ids_official.1=merge(ids_official[(ids_official$probe_id)%in%probe_avaliable,],uni_name_official[,c(1,3)],by='symbol_official',all.x=T)
# #去除原注释存在但查不到其现用名的探针
# ids_GEO.1=replace(ids_GEO.1,ids_GEO.1=="",NA)
# ids_official.1=replace(ids_official.1,ids_official.1=="",NA)
# # GEO_removed=ids_GEO.1[which(!is.na(ids_GEO.1$symbol_anno_GEO)&is.na(ids_GEO.1$Approved.symbol)),]
# # ids_GEO.1=ids_GEO.1[which(!(!is.na(ids_GEO.1$symbol_anno_GEO)&is.na(ids_GEO.1$Approved.symbol))),]
# # official_removed=ids_official.1[which(!is.na(ids_official.1$symbol_official)&is.na(ids_official.1$Approved.symbol)),]
# # ids_official.1=ids_official.1[which(!(!is.na(ids_official.1$symbol_official)&is.na(ids_official.1$Approved.symbol))),]
# #删除只保留各探针对应的现用名
# # ids_GEO.1=ids_GEO.1[!(ids_GEO.1$symbol_anno_GEO==""&ids_GEO.1$Approved.symbol==""),]
# # ids_official.1=ids_official.1[!(ids_official.1$symbol_official==""&ids_official.1$Approved.symbol==""),]
# 
# # #用以下策略合并两个注释文件 1.旧文件没有对应但是新的有对应，直接取代 2.新旧文件信息不一样，保留旧的 3.新旧文件信息一致，随便保留一个
# colnames(ids_GEO.1)[3]='Approved_symbol_GEO'
# colnames(ids_official.1)[3]='Approved_symbol_official'
# ids.compare=merge(ids_GEO.1,ids_official.1,by='probe_id',all=T)
# ids.compare=replace(ids.compare,ids.compare=="",NA)
# ids.compare.all.na=ids.compare[which(is.na(ids.compare$symbol_anno_GEO)&is.na(ids.compare$symbol_official)),]# GEO 官方文件以及对应现用名均不存在
# ids.compare.GEO_same_official_same_approve=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO==ids.compare$symbol_official & ids.compare$Approved_symbol_GEO==ids.compare$Approved_symbol_official),]# GEO 官方文件 相同 对应的现用名相同
# ids.compare.GEO_same_official_no_approve=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO==ids.compare$symbol_official&is.na(ids.compare$Approved_symbol_GEO)&is.na(ids.compare$Approved_symbol_official)),]# GEO 官方文件 相同 对应现用名不存在
# ids.compare.GEO_diff_official_official_approve_only=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO!=ids.compare$symbol_official & is.na(ids.compare$Approved_symbol_GEO)& !is.na(ids.compare$Approved_symbol_official)),]# GEO 官方文件不同  GEO现用名不存在 官方现用名存在
# ids.compare.GEO_diff_official_GEO_approve_only=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO != ids.compare$symbol_official & !is.na(ids.compare$Approved_symbol_GEO) & is.na(ids.compare$Approved_symbol_official)),]# GEO 官方文件不同  GEO现用名存在 官方现用名不存在
# ids.compare.GEO_diff_official_no_approve=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO != ids.compare$symbol_official & is.na(ids.compare$Approved_symbol_GEO) & is.na(ids.compare$Approved_symbol_official)),]# GEO 官方文件不同 现用名不存在
# ids.compare.GEO_diff_official_approve_diff=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO!=ids.compare$symbol_official & !is.na(ids.compare$Approved_symbol_GEO) & !is.na(ids.compare$Approved_symbol_official) & ids.compare$Approved_symbol_GEO != ids.compare$Approved_symbol_official),]# GEO 官方文件不同  GEO现用名存在 官方现用名存在 二者不同
# ids.compare.GEO_diff_official_approve_same=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO!=ids.compare$symbol_official & !is.na(ids.compare$Approved_symbol_GEO) & !is.na(ids.compare$Approved_symbol_official) & ids.compare$Approved_symbol_GEO == ids.compare$Approved_symbol_official),]# GEO 官方文件不同  GEO现用名存在 官方现用名存在 二者相同
# ids.compare.GEO_empty_only_official_approve_no=ids.compare[which(is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & is.na(ids.compare$Approved_symbol_official)),]# GEO 不存在 官方存在 官方现用名不存在
# ids.compare.GEO_empty_only_official_approve_exist=ids.compare[which(is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & !is.na(ids.compare$Approved_symbol_official)),]# GEO 不存在 官方存在 官方现用名存在
# ids.compare.official_empty_only_GEO_approve_exist=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & is.na(ids.compare$symbol_official) & !is.na(ids.compare$Approved_symbol_GEO)),]# GEO 存在 官方不存在 GEO现用名存在
# ids.compare.official_empty_only_GEO_approve_no=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & is.na(ids.compare$symbol_official) & is.na(ids.compare$Approved_symbol_GEO)),]# GEO 存在 官方不存在 GEO现用名不存在
# ids.compare.GEO_same_official_GEO_approve_only=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO==ids.compare$symbol_official & is.na(ids.compare$Approved_symbol_official) & !is.na(ids.compare$Approved_symbol_GEO)),]# GEO 官方 相同 官方现用名不存在，这里实际上是有问题的，手动调成两个现用名一样的情况
# probe_uni=c(ids.compare.all.na$probe_id,
#             ids.compare.GEO_diff_official_approve_diff$probe_id,
#             ids.compare.GEO_diff_official_approve_same$probe_id,
#             ids.compare.GEO_diff_official_GEO_approve_only$probe_id,
#             ids.compare.GEO_diff_official_official_approve_only$probe_id,
#             ids.compare.GEO_empty_only_official_approve_exist$probe_id,
#             ids.compare.GEO_empty_only_official_approve_no$probe_id,
#             ids.compare.GEO_same_official_no_approve$probe_id,
#             ids.compare.GEO_same_official_same_approve$probe_id,
#             ids.compare.official_empty_only_GEO_approve_exist$probe_id,
#             ids.compare.official_empty_only_GEO_approve_no$probe_id,
#             ids.compare.GEO_diff_official_no_approve$probe_id,
#             ids.compare.GEO_same_official_GEO_approve_only$probe_id)
# probe_ini.1=unique(probe_uni)
# # haha=ids.compare[ids.compare$probe_id%in%setdiff(probe_avaliable,probe_ini.1),]
# # 用以下规则更新探针的注释 1.两个文件注释相同且现用名相同的，直接使用现用名 2.只有一个文件有注释且有现用名的探针，用这个现用名 3. 两个文件均有注释且现用名不同的探针，使用GEO的探针的现用名
# #处理异常的情况，此时只剩下上面的12种情况
# ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO==ids.compare$symbol_official & is.na(ids.compare$Approved_symbol_official) & !is.na(ids.compare$Approved_symbol_GEO)),5]=ids.compare[which(!is.na(ids.compare$symbol_anno_GEO) & !is.na(ids.compare$symbol_official) & ids.compare$symbol_anno_GEO==ids.compare$symbol_official & is.na(ids.compare$Approved_symbol_official) & !is.na(ids.compare$Approved_symbol_GEO)),2]
# ids.final=ids.compare
# ids.final$approve_id=ifelse((!is.na(ids.final$Approved_symbol_GEO)),ids.final$Approved_symbol_GEO,ifelse((!is.na(ids.final$Approved_symbol_official)),ids.final$Approved_symbol_official,NA))
# gene_ids=ids.final[,c(1,6)]
# gene_ids=gene_ids[!is.na(gene_ids$approve_id),]
# colnames(gene_ids)=c('probe_id','symbol')
# gene_ids=gene_ids[!duplicated(gene_ids$probe_id),]
# ids.merged$id_keep=ifelse(is.na(ids.merged$symbol_new)&ids.merged$symbol=='',NA,ifelse(ids.merged$symbol==""&!is.na(ids.merged$symbol_new),ids.merged$symbol_new,ifelse(ids.merged$symbol_new==ids.merged$symbol,ids.merged$symbol_new,ids.merged$symbol)))
# ids.merged.1=ids.merged[!is.na(ids.merged$id_keep),]
#以上部分涉及两个注释文件取并集，暂时用不上----------------------------------------------------------------------


# ---- Collapse probes to one row per gene symbol ----
# Drop probes that are absent from the annotation table: they map to no recorded gene.
has_annotation <- rownames(data_expr) %in% gene_ids$probe_id
data_expr <- data_expr[has_annotation, ]

# Reorder the annotation rows to follow the row order of the expression matrix
# (match() returns, for each probe of the matrix, its position in gene_ids).
annotation_row <- match(rownames(data_expr), gene_ids$probe_id)
gene_ids <- gene_ids[annotation_row, ]

# Group the probes by symbol and keep, for each symbol, the probe with the largest
# mean expression across samples.
pick_highest_mean_probe <- function(probe_block) {
  block_means <- rowMeans(probe_block)
  rownames(probe_block)[which.max(block_means)]
}
tmp <- by(data_expr, gene_ids$symbol, pick_highest_mean_probe)
# Alternative: keep the probe with the largest median instead
# tmp2 = by(exp,
#           gene_ids$symbol,
#           function(x) rownames(x)[which.max(apply(x,1,median))])
probes <- as.character(tmp)
dim(data_expr)
is_selected_probe <- rownames(data_expr) %in% probes
exp1 <- data_expr[is_selected_probe, ] # one probe per gene
dim(exp1)

# The probe with the largest mean is used for the analysis; relabel rows with the symbol.

rownames(exp1) <- gene_ids[match(rownames(exp1), gene_ids$probe_id), 2]


# ---- Add ENSEMBL ids looked up from the gene symbols ----
library(org.Hs.eg.db)
exp1 <- as.data.frame(exp1)
exp1$gene_symbol <- rownames(exp1)
# One list element per symbol, holding every ENSEMBL id mapped to it.
ensembl_by_symbol <- mapIds(
  org.Hs.eg.db,
  keys = exp1$gene_symbol,
  column = 'ENSEMBL',
  keytype = 'SYMBOL',
  multiVals = 'list'
)
# Several ids for one symbol are joined into a single comma-separated string.
exp1$ENSEMBL_id <- vapply(
  ensembl_by_symbol,
  function(ids) paste(ids, collapse = ','),
  character(1)
)




# ---- Append the group name to every sample column and save the full matrix ----
# Dictionary accession id -> group, used to build "<column>_<group>" names.
dict <- setNames(sample2id$group, sample2id$geo_accession)
colnames(exp1) <- paste0(colnames(exp1), '_', dict[colnames(exp1)])
# The two annotation columns have no group and were suffixed with "_NA": restore them.
colnames(exp1)[colnames(exp1) == 'gene_symbol_NA'] <- 'gene_symbol'
colnames(exp1)[colnames(exp1) == 'ENSEMBL_id_NA'] <- 'ENSEMBL_id'

setwd('H:/project')
all_group_file <- if (platform_seperate_relationship_file == TRUE) {
  paste0('results/', GEO_id, '_', platform_id, '_expression_matrix_all_group_using_lumi_newpipeline_3_15.txt')
  # write.table(exp1,file = paste0('results/',GEO_id,'_',platform_id,'_',extra_information,'_expression_matrix_all_group.txt'),sep = '\t',quote = F)
} else {
  paste0('results/', GEO_id, '_expression_matrix_all_group.txt')
}
write.table(exp1, file = all_group_file, sep = '\t', quote = FALSE)
#制作每种分组的表达矩阵

#' Write one expression-matrix file per sample group.
#'
#' For every distinct value of `sample2id$group`, selects the columns of `input_matrix`
#' whose name contains that group name, appends the `gene_symbol` and `ENSEMBL_id`
#' columns, and writes the result to
#' `results/<GEO_id>_<group>_<platform_id>_matrix.txt` (tab separated, unquoted).
#' `GEO_id` and `platform_id` are read from the calling environment.
#'
#' @param sample2id Data frame with a `group` column.
#' @param input_matrix Data frame with the sample columns plus `gene_symbol` and
#'   `ENSEMBL_id`.
#' @return The result of `sapply()` over the groups (the `write.table()` return values);
#'   the function is called for its side effect of writing files.
generate1group_matrix=function(sample2id,input_matrix){
  group_type=unique(sample2id$group)
  all_columns=colnames(input_matrix)
  # The annotation columns are never sample columns, even when a short group name
  # happens to be a substring of their names (e.g. group "B" and "ENSEMBL_id").
  is_annotation=all_columns %in% c('gene_symbol','ENSEMBL_id')
  fun1=function(group_input){
    # fixed = TRUE: the group name is matched literally, so names containing regex
    # metacharacters (e.g. "CD4+") select the intended columns.
    group_columns=which(!is_annotation & grepl(group_input,all_columns,fixed = TRUE))
    # drop = FALSE keeps a data frame even when the group has a single sample.
    return_matrix=input_matrix[,group_columns,drop = FALSE]
    # Explicit column names instead of relying on `$` partial matching.
    return_matrix$gene_symbol=input_matrix[['gene_symbol']]
    return_matrix$ENSEMBL_id=input_matrix[['ENSEMBL_id']]
    write.table(return_matrix,file = paste0('results/',GEO_id,'_',group_input,'_',platform_id,'_matrix.txt'),sep = '\t',quote = F)
    # write.table(return_matrix,file = paste0('results/',GEO_id,'_',group_input,'_',platform_id,'_',extra_information,'_matrix.txt'),sep = '\t',quote = F)

  }
  sapply(group_type,fun1)
}
# Write the per-group matrix files for every group in sample2id, taking the sample
# columns and the gene_symbol / ENSEMBL_id columns from the final table exp1.
generate1group_matrix(sample2id = sample2id,input_matrix = exp1)




