# Start from an empty global environment and work relative to the project root.
rm(list = ls())
setwd("H:/project")
library(GenomicFeatures)
library(methylumi)
library(lumi)
library(limma)

## Step 1: starting from the Illumina chip result file, build the expression
## matrix ourselves with R (the lumi package is used further down).
GEO_id <- "GSE137438"
platform_id <- "GPL10558"
# tiss_type <- "other_blood_cell"
tiss_type <- "PBMC_from_Peripheral_blood"
outliers <- NA
useGEO_file <- FALSE
doQualitycontrol <- TRUE
first_time_for_pdata <- FALSE
platform_seperate_relationship_file <- TRUE
platform_seperate_pdata <- FALSE
use_extra <- FALSE
extra_information <- "CD8"
threshold <- 0.25

# Raw-data file: the "_1" file by default, or the file tagged with
# extra_information when use_extra is switched on.
fileName <- if (use_extra == FALSE) {
  paste0("results/lumi_accessible_raw_data/", GEO_id, "_", platform_id, "_lumi_accessible_raw_1.txt")
} else {
  paste0("results/lumi_accessible_raw_data/", GEO_id, "_", platform_id, "_lumi_accessible_raw_", extra_information, ".txt")
}
file.raw <- read.table(fileName, sep = "\t", header = TRUE, fill = TRUE, quote = "")

# Every column whose name matches "ID|ID_REF" is renamed to PROBE_ID.
colnames(file.raw)[grepl("ID|ID_REF", colnames(file.raw))] <- "PROBE_ID"

# Report whether duplicated probe ids are present, then keep the first
# occurrence of each and use the ids as row names.
print(any(duplicated(file.raw$PROBE_ID)))
file.raw <- file.raw[!duplicated(file.raw$PROBE_ID), ]
rownames(file.raw) <- file.raw$PROBE_ID
# Drop the first column (expected to be the probe id column, which is now
# carried by the row names).
file.raw <- file.raw[, -1]

# Probe filtering
# Select probes with low detection p-values in the raw data.
# Note: first work out which software version produced the p-values. The
# boxplot below gives a quick answer: if most probes have very small p-values,
# smaller is better (output of the newer software); otherwise the file comes
# from the older software and every p-value has to be transformed (see the
# commented-out `1 - probe_list` line).
probe_list <- file.raw[, grepl("Detection", colnames(file.raw))]
# probe_list <- file.raw[, which(grepl("pvalue", colnames(file.raw)))]
probe_number_all <- nrow(probe_list)

# Coerce every detection column to numeric; sapply() returns a matrix here, so
# the probe ids have to be put back as row names.
probe_list <- sapply(as.data.frame(probe_list), as.numeric)
rownames(probe_list) <- rownames(file.raw)
boxplot(probe_list, names = FALSE)
# probe_list <- 1 - probe_list
median_rawdata <- median(apply(probe_list, 2, median))

# A probe passes when it has p < 0.01 in at least `threshold` (a fraction) of
# the samples; both cut-offs can be adjusted.
probe_pass <- rowSums(probe_list < 0.01) >= (threshold * ncol(probe_list))
probe_keep <- rownames(probe_list)[which(probe_pass)]
probe_keep_number <- length(probe_keep)

# Probes that are filtered out
probe_filt <- rownames(probe_list)[which(!probe_pass)]
probe_filt_number <- length(probe_filt)

proportion_filter <- probe_filt_number / probe_number_all

# One-row summary of the filtering step, labelled with the GEO series id.
filter_report <- data.frame(
  number_of_all_probes = probe_number_all,
  number_of_probe_keep = probe_keep_number,
  number_of_probe_filt = probe_filt_number,
  proportion_filter = proportion_filter
)
rownames(filter_report) <- GEO_id
if (use_extra == FALSE) {
  write.table(filter_report, file = paste0("results/", GEO_id, "_", platform_id, "_filter_report.txt"), sep = "\t", row.names = TRUE, quote = FALSE)
} else {
  write.table(filter_report, file = paste0("results/", GEO_id, "_", platform_id, "_", extra_information, "_filter_report.txt"), sep = "\t", row.names = TRUE, quote = FALSE)
}
# Read the raw data
setwd("H:/project")
# limma route: read the intensities together with the detection p-values and
# pass the result to neqc(), telling it which column holds the p-values.
data <- read.ilmn(fileName, probeid = "ID_REF", other.columns = "Detection.Pval")
data.neqc <- neqc(data, detection.p = "Detection.Pval")
# x.lumi.1 <- lumiR.batch(fileName, sep = "\t", QC = FALSE, detectionTh = 0.05)

# lumi route: the same file read with lumiR.batch(); this object feeds the
# quality-control step below.
x.lumi <- lumiR.batch(fileName, sep = "\t")

# Quality control
# Runs the arrayQualityMetrics boxplot and heatmap modules on the lumi object,
# saves each module's per-sample outlier statistic, writes the HTML report and
# records the outlier samples in `outliers` (used later to drop columns from
# the expression matrix).
if (doQualitycontrol == TRUE) {
  library(arrayQualityMetrics)
  data.qc <- prepdata(expressionset = x.lumi, intgroup = c(), do.logtransform = TRUE)

  # Boxplot module: statistic saved to the "Ka" file.
  bo <- aqm.boxplot(data.qc)
  bo_stat <- bo@outliers@statistic
  write.table(as.data.frame(bo_stat), file = paste0("results/", GEO_id, "_", platform_id, "_qc_boxplot_Ka_values.txt"), sep = "\t")

  # Heatmap (between-array distance) module: statistic saved to the "Sa" file.
  # Fix: the heatmap statistic is written here; previously the boxplot
  # statistic was written a second time.
  hp <- aqm.heatmap(data.qc)
  hp_stat <- hp@outliers@statistic
  write.table(as.data.frame(hp_stat), file = paste0("results/", GEO_id, "_", platform_id, "_qc_distance_pheatmap_Sa_values.txt"), sep = "\t")

  aqm.writereport(
    modules = list(bo, hp),
    reporttitle = paste0("arrayQualityMetrics report for ", GEO_id, "_", platform_id),
    outdir = paste0("results/", GEO_id, "_", platform_id, "_QCreport"),
    arrayTable = data.qc$pData
  )
  # Alternative: run the complete arrayQualityMetrics pipeline in one call.
  # arrayQualityMetrics(expressionset = data.raw,
  #                     outdir = paste0("results/", GEO_id, "_QCreport"),
  #                     force = TRUE, reporttitle = paste0("arrayQualityMetrics report for ", GEO_id), spatial = TRUE, do.logtransform = TRUE)

  # Collect the outlier samples: a sample is removed only when BOTH modules
  # flag it.
  # Fix: hp_outliers is now taken from the heatmap module; it used to be read
  # from the boxplot module, which made the intersection a no-op.
  bo_outliers <- names(bo@outliers@which)
  hp_outliers <- names(hp@outliers@which)
  outliers <- intersect(bo_outliers, hp_outliers)
}


# Get the sample (clinical) information and assign every sample to a group.
# The result of this section is `sample2id`, the geo_accession / group columns
# of `pdata`.

# Read a pdata table saved earlier whose header row is one field shorter than
# the data rows (presumably because it was written by write.table() with row
# names - TODO confirm). The file is read without a header, the header row is
# shifted one field to the right, promoted to column names, and the first
# column becomes the row names.
# Stops when the last field of the first row is not empty, i.e. when the file
# does not have the expected layout.
read_saved_pdata <- function(path) {
  pdata <- read.table(path, sep = "\t", fill = TRUE, header = FALSE, row.names = NULL)
  # isTRUE() sends an NA in that field to the explicit error below instead of
  # failing inside if().
  if (!isTRUE(pdata[1, ncol(pdata)] == "")) {
    stop("Check pdata file, fix it manually if the problem is too complicated!")
  }
  pdata[1, ] <- c("", pdata[1, -length(pdata[1, ])])
  colnames(pdata) <- pdata[1, ]
  pdata <- pdata[-1, ]
  rownames(pdata) <- pdata[, 1]
  pdata <- pdata[, -1]
  pdata
}

if (first_time_for_pdata == TRUE) {
  # Load the GSE series matrix with GEOquery.

  # Data processing: 1. after obtaining the matrix, correct the overall
  # expression level
  library(GEOquery)
  library(stringr)
  # library(tidyverse)
  # Only for slow connections; not needed when the file is available locally
  # library(GEOmirror)
  # eSet <- geoChina(GEO_id)
  # If the regular download works there is no need for the mirror.
  # Read the (local) series matrix file.
  # NOTE(review): the path contains "_affymetrix" although this script handles
  # Illumina data - confirm the directory name.
  gset <- getGEO(filename = paste0("H:/HIV_microarray_data_GEO/", tiss_type, "/", platform_id, "_affymetrix/", GEO_id, "_series_matrix.txt.gz"), destdir = ".", GSEMatrix = TRUE, AnnotGPL = FALSE, getGPL = FALSE)
  # Inspect the grouping information available in pdata.
  pdata <- pData(gset[1])

  # print() is required: inside an if-block a bare table() call shows nothing.
  print(table(pdata$source_name_ch1))

  # NOTE(review): the three steps below look like alternative grouping
  # strategies that were meant to be chosen between by hand; they are kept in
  # their original order, so each one overwrites the pdata file and sample2id
  # produced by the previous one.

  # If the series matrix provides no grouping information, set the groups
  # yourself.
  # Manual grouping - not recommended, use only when the groups are unclear.
  # Read the group file.
  group_table <- read.table(paste0("data/", GEO_id, "_group.txt"), sep = "\t", header = TRUE)
  pdata$sample_id <- rownames(pdata)
  pdata <- merge(pdata, group_table, by = "sample_id")
  write.table(pdata, file = paste0("results/", GEO_id, "_pdata.txt"), sep = "\t", quote = FALSE)
  sample2id <- pdata[, which(colnames(pdata) %in% c("geo_accession", "group"))]

  # Assign the group name from the source-name column.
  pdata <- pdata[pdata$source_name_ch1 == "PBMC samples at week 0", ]
  pdata$group <- "HIV_ART"
  sample2id <- pdata[, which(colnames(pdata) %in% c("geo_accession", "group"))]
  write.table(pdata, file = paste0("results/", GEO_id, "_pdata.txt"), sep = "\t", quote = FALSE)

  # Force the group from the sample title.
  # Written in base R: the earlier pipeline called mutate(), but dplyr is never
  # attached by this script.
  # Earlier variant, kept for reference:
  # group <- ifelse(title == "Patient 1", "HIV_nonART", ifelse(title == "Patient 2", "HIV_ART", ifelse(title == "Patient 3", "HIV_ART", NA)))
  pdata$group <- ifelse(grepl("CP", pdata$title), "HIV_nonART",
                        ifelse(grepl("EC", pdata$title), "HIV_EC", NA))
  setwd("H:/project")
  write.table(pdata, file = paste0("results/", GEO_id, "_pdata.txt"), sep = "\t", quote = FALSE)
  sample2id <- pdata[, which(colnames(pdata) %in% c("geo_accession", "group"))]
  # The expression matrix (exp1) is not written here any more: it does not
  # exist yet at this point and is written once it has been built further down.
} else {
  # Re-use a pdata file saved by an earlier run; the file name carries the
  # platform id only when pdata is kept per platform.
  if (platform_seperate_pdata == TRUE) {
    pdata <- read_saved_pdata(paste0("H:/project/results/", tiss_type, "/illumina/matrix/", GEO_id, "_", platform_id, "_pdata.txt"))
  } else {
    pdata <- read_saved_pdata(paste0("H:/project/results/", tiss_type, "/illumina/matrix/", GEO_id, "_pdata.txt"))
    # pdata <- read_saved_pdata(paste0("H:/project/results/", tiss_type, "/illumina/matrix/", GEO_id, "_", extra_information, "_pdata.txt"))
  }
  sample2id <- pdata[, which(colnames(pdata) %in% c("geo_accession", "group"))]
}

# Expression values taken from the neqc() result.
data_expr <- data.neqc$E
dim(data_expr)

# write.table(data_expr, file = paste0("results/GSE52900_probe_expression_matrix_filtered.txt"), sep = "\t", quote = FALSE)

# Keep only the probes that passed the detection p-value filter.
data_expr <- data_expr[(rownames(data_expr) %in% probe_keep), ]
dim(data_expr)
# Remove the samples that failed quality control from the expression matrix.
data_expr <- data_expr[, !(colnames(data_expr) %in% outliers)]
# write.table(data_expr, file = paste0("results/GSE52900_probe_expression_matrix_filtered.txt"), sep = "\t", quote = FALSE)


# data_expr_primitive <- data_expr

# Re-order the columns by sample number
# data_expr <- as.data.frame(data_expr)
# data_expr <- data_expr[, order(as.numeric(gsub("Sample ", "", names(data_expr))))]

# Probe annotation
# Note: column names in the annotation files are not uniform, so adapt as
# needed; some GPL platforms provide no annotation at all, e.g. GPL16956.
library(GEOquery)
library(stringr)
annoset <- getGEO(filename = paste0("H:/project/data/annotation_file/illumina/", platform_id, ".annot.gz"), destdir = ".")
b <- annoset@dataTable@table
colnames(b)
# The symbol column is not always called the same ("Symbol", "Gene Symbol",
# ...); if this line fails, look at colnames(b) to find the annotation column.
ids2 <- b[, c("ID", "Gene symbol")]
colnames(ids2) <- c("probe_id", "symbol")

# Number and share of probes annotated with several genes ("///"-separated).
probe_multi <- nrow(ids2[str_detect(ids2$symbol, "///"), ])
propotion1 <- probe_multi / nrow(ids2)

# Drop probes without a gene and probes covering overlapping genes. The latter
# carry a list of gene names separated by "///" in their symbol; they are
# removed for now to keep the analysis simple.
ids2 <- ids2[ids2$symbol != "" & !str_detect(ids2$symbol, "///"), ]
gene_ids <- ids2

# Remove probes that are in the expression matrix but not in the filtered
# annotation (no gene, or overlapping genes): they map to no recorded gene.
data_expr <- data_expr[(rownames(data_expr) %in% gene_ids$probe_id), ]

# Put the annotation rows in the same order as the expression matrix rows.
# match() returns, for each element of its first argument, the position in the
# second one, so gene_ids is re-ordered to follow the matrix.
gene_ids <- gene_ids[match(rownames(data_expr), gene_ids$probe_id), ]

# Group the probes that share a symbol, compute the mean expression of each
# probe, and keep the probe with the highest mean as the only probe for that
# symbol. by() splits its first argument according to its second argument.
tmp <- by(
  data_expr,
  gene_ids$symbol,
  function(x) rownames(x)[which.max(rowMeans(x))]
)
probes <- as.character(tmp)
dim(data_expr)
exp1 <- data_expr[(rownames(data_expr) %in% probes), ] # one probe per gene
dim(exp1)
# Alternative: keep the probe with the highest median instead
# tmp2 <- by(exp,
#            gene_ids$symbol,
#            function(x) rownames(x)[which.max(apply(x, 1, median))])
# probes2 <- as.character(tmp2)
# dim(exp)
# exp <- exp[rownames(exp) %in% probes2, ] # one probe per gene
# dim(exp)
# The probe with the highest mean is used for now.

# Replace the probe ids in the row names by the matching gene symbols.
rownames(exp1) <- gene_ids$symbol[match(rownames(exp1), gene_ids$probe_id)]


# Look up the ENSEMBL ids that belong to the gene symbols
library(org.Hs.eg.db)
exp1 <- as.data.frame(exp1)
exp1$gene_symbol <- rownames(exp1)
# multiVals = "list" returns every ENSEMBL id found for a symbol; the ids of
# one symbol are joined into a single comma-separated string.
ensembl_hits <- mapIds(
  org.Hs.eg.db,
  keys = exp1$gene_symbol,
  column = "ENSEMBL",
  keytype = "SYMBOL",
  multiVals = "list"
)
exp1$ENSEMBL_id <- vapply(ensembl_hits, function(x) paste(x, collapse = ","), character(1))




# Append the group to every column name of the expression matrix.
# dict maps a geo_accession (expected to equal the matrix column name) to its
# group.
dict <- setNames(sample2id$group, sample2id$geo_accession)
colnames(exp1) <- paste0(colnames(exp1), "_", dict[colnames(exp1)])
# The two annotation columns have no entry in dict and therefore received an
# "_NA" suffix; restore their original names.
colnames(exp1)[colnames(exp1) == "gene_symbol_NA"] <- "gene_symbol"
colnames(exp1)[colnames(exp1) == "ENSEMBL_id_NA"] <- "ENSEMBL_id"

setwd("H:/project")
# Write the annotated expression matrix for all groups; the file name carries
# the platform id when results are kept per platform.
expr_out_file <- if (platform_seperate_relationship_file == TRUE) {
  paste0("results/", GEO_id, "_", platform_id, "_expression_matrix_all_group_using_limma.txt")
  # paste0("results/", GEO_id, "_", platform_id, "_", extra_information, "_expression_matrix_all_group.txt")
} else {
  paste0("results/", GEO_id, "_expression_matrix_all_group.txt")
}
write.table(exp1, file = expr_out_file, sep = "\t", quote = FALSE)
# Build and write one expression matrix per group.
#
# For every distinct, non-missing group in `sample2id$group`, the columns of
# `input_matrix` whose name ends in "_<group>" are selected, the gene_symbol
# and ENSEMBL_id columns are appended, and the result is written to
# "<out_dir>/<geo_id>_<group>_<platform>_matrix.txt" (tab separated, unquoted).
#
# Arguments:
#   sample2id     data.frame with a `group` column.
#   input_matrix  data.frame whose sample columns are named "<sample>_<group>"
#                 and which holds the columns `gene_symbol` and `ENSEMBL_id`.
#   geo_id        series id used in the file name; defaults to the script-level
#                 GEO_id.
#   platform      platform id used in the file name; defaults to the
#                 script-level platform_id.
#   out_dir       output directory (must exist); defaults to "results".
#
# Returns (invisibly) the character vector of groups that were written.
generate1group_matrix <- function(sample2id, input_matrix,
                                  geo_id = GEO_id,
                                  platform = platform_id,
                                  out_dir = "results") {
  annotation_cols <- c("gene_symbol", "ENSEMBL_id")

  # Samples without a group would only produce an "NA" file holding nothing
  # but the annotation columns, so they are skipped.
  group_type <- unique(as.character(sample2id$group))
  group_type <- group_type[!is.na(group_type)]

  sample_cols <- setdiff(colnames(input_matrix), annotation_cols)

  for (group_input in group_type) {
    # Literal suffix match instead of a regex substring match, so that a group
    # such as "HIV" no longer also picks up the columns of "HIV_ART".
    # Groups whose name is itself a "_"-suffix of another group's name would
    # still overlap.
    in_group <- endsWith(sample_cols, paste0("_", group_input))

    # drop = FALSE keeps a data.frame even when a group has a single sample.
    return_matrix <- input_matrix[, sample_cols[in_group], drop = FALSE]
    # Full column names: no reliance on partial matching of `$`.
    return_matrix$gene_symbol <- input_matrix[["gene_symbol"]]
    return_matrix$ENSEMBL_id <- input_matrix[["ENSEMBL_id"]]

    write.table(return_matrix, file = paste0(out_dir, "/", geo_id, "_", group_input, "_", platform, "_matrix.txt"), sep = "\t", quote = FALSE)
    # write.table(return_matrix, file = paste0(out_dir, "/", geo_id, "_", group_input, "_", platform, "_", extra_information, "_matrix.txt"), sep = "\t", quote = FALSE)
  }

  invisible(group_type)
}
# Write the per-group matrix files for the groups listed in sample2id, taking
# the columns from the annotated expression matrix exp1.
generate1group_matrix(sample2id = sample2id,input_matrix = exp1)
