Methods
Gene Fusions identification and differential expression analysis
We employed two Gene Fusions(GFs) detection tools, including arriba, STAR-Fusion, to identify, parse, and annotate GFs junctions within each sample, using the human genome version hg38 as the reference. In our pursuit of biologically robust GFs identification within each cohort, we undertook an approach metafusion that amalgamated results from two GFs detection tools. Firstly, we exclusively retained Fusions located on chromosomes 1 to 22, as well as the X and Y chromosomes. Secondly, GFs were mandated to exhibit genomic overlap with gene regions as defined in the reference genome annotation file, specifically “gencode.v34.annotation.gtf.”. Thirdly to ensure the precision and recall in GFs prediction with different tools, one can either rely on consensus predictions or all predctions and choosing junction-crossing reads and reads mates which flank (span) the breakpoint. Fourthly, we offer FusionAnnotator to identify and prioritize GFs that have been previously reported in cancer or normal samples. Lastly, the detected data can be divided into Cis-SAGes and other categories to further explore the debate on what should be considered true fusion RNAs. This meticulous methodological approach was instrumental in ensuring the acquisition of biologically sound and reliable GFs datasets, thereby fortifying the integrity of our analysis within each cohort. This analysis facilitated comparisons between patient groups, such as those who responded to checkpoint immunotherapy versus non-responders, as well as between samples collected before and during ICB treatment.
Response and Response2
We gathered and processed patient ‘Response’ data from various cohorts. However, due to inconsistencies among these cohorts, such as the usage of different terms like CR, PR, SD, PD, R, NR, DCB, and NDB, we harmonized the response variable based on the timing of treatment and treatment response. As a result, we introduced a new variable called ‘Response2’ with four distinct categories: Pre-R, Pre-NR, On-R, and On-NR. In cases where response data was not available, we assigned the category NE.
Cancer Type Abbreviation
Abbr. | Description |
---|---|
BLCA | Bladder urothelial carcinoma |
SKCM | Skin cutaneous melanoma |
KIRC & CCRCC | Kidney renal clear cell carcinoma |
HNSC & HNSCC | Head and neck squamous cell carcinoma |
NSCLC | Non-small cell lung cancer, including lung adenocarcinoma and lung squamous cell carcinoma |
SCLC | Small cell lung cancer |
SARC | Sarcoma |
BRCA | Breast cancer |
GBM | Glioblastoma Multiforme |
STAD | Stomach Adenocarcinoma |
SGC | Salivary gland cancer |
LUSC | Lung Squamous Cell Carcinoma |
MESO | Mesothelioma |
PCPG | Pheochromocytoma and Paraganglioma |
LIHC | Liver Hepatocellular Carcinoma |
PRAD | Prostate Adenocarcinoma |
ACC | Adrenocortical Carcinoma |
TGCT | Testicular Germ Cell Tumors |
PAAD | Pancreatic Adenocarcinoma |
UCEC | Uterine Corpus Endometrial Carcinoma |
THCA | Thyroid carcinoma |
CESC | Cervical squamous cell carcinoma and endocervical adenocarcinoma |
ESCA | Esophageal carcinoma |
READ | Rectum adenocarcinoma |
OV | Ovarian Serous Cystadenocarcinoma |
CHOL | Cholangiocarcinoma |
UCS | Uterine Carcinosarcoma |
LUAD | Lung Adenocarcinoma |
LGG | Brain Lower Grade Glioma |
HGSC | High-Grade Serous Carcinoma |
LSCC | Lung Squamous Cell Carcinoma |
PDAC | Pancreatic Ductal Adenocarcinoma |
THYM | Thymoma |
KICH | Kidney Chromophobe |
KIRP | Kidney Renal Papillary Cell Carcinoma |
COAD | Colon Adenocarcinoma |
LAML & AML | Acute Myeloid Leukemia |
DLBC | Lymphoid Neoplasm Diffuse Large B-cell Lymphoma |
ALL | Acute lymphocytic leukemia |
AML-IF | Acute Myeloid Leukemia, Induction Failure Subproject |
NBL | Neuroblastoma |
OS | Osteosarcoma |
ES | Ewing sarcoma |
WT | Wilms tumor |
CCSK | Clear Cell Sarcoma of the Kidney |
RT | Kidney, Rhabdoid Tumor |
CCSK | CNS, ependymoma |
MRT | CNS, rhabdoid tumor |
CCSK | CNS, medulloblastoma |
CCSK | NHL, anaplastic large cell lymphoma |
CCSK | NHL, Burkitt lymphoma (BL) |
CCSK | Rhabdomyosarcoma |
CCSK | Soft tissue sarcoma, non-rhabdomyosarcoma |
CCSK | CNS, other |
B-ALL | Precursor B-cell lymphoblastic leukemia |
GNB | Ganglioneuroblastoma |
Clinical data preprocessing
TCGA
library(data.table)
library(dplyr)
# load("/home/data2/Projects/Fusion/unclean/TCGA_ALL_clinical.rdata")
#
# TCGA_ALL_clinical |>
# rename(Patient_ID = bcr_patient_barcode,
# Age = age_at_initial_pathologic_diagnosis,
# Sex = gender,
# Race = race,
# )
# [Discrepancy] [Not Applicable] [Not Available]
# [Discrepancy] [Not Available] [Unknown]
# # [Discrepancy] [Not Applicable]
# 29 105
# [Not Available] [Not Evaluated]
# 5140 4
# [Unknown]
library(UCSCXenaShiny)
table(tcga_clinical_fine$Stage_ajcc)
tcga_clinical
tcga_clinical_fine
tcga_genome_instability
tcga_purity
tcga_subtypes
tcga_surv
nrow(tcga_clinical_fine)
nrow(unique(tcga_clinical_fine))
= unique(tcga_clinical_fine) |>
tcga_clinical left_join(tcga_surv, by = c("Sample"="sample")) |>
left_join(tcga_purity, by = c("Sample"="sample"))
551, ]$Sample
tcga_clinical[
sum(startsWith(tcga_genome_instability$sample, "TCGA"))
= tcga_clinical |>
tcga_clinical left_join(tcga_genome_instability[startsWith(tcga_genome_instability$sample, "TCGA") & !duplicated(tcga_genome_instability$sample), ], by = c("Sample"="sample")) |>
#left_join(tcga_genome_instability[duplicated(tcga_genome_instability$sample), ], by = c("Sample"="sample")) |>
left_join(tcga_subtypes, by = c("Sample"="sampleID"))
sum(is.na(tcga_clinical$Sample))
sum(duplicated(tcga_clinical$Sample))
duplicated(tcga_clinical$Sample), ]$Sample
tcga_clinical[
= tcga_clinical[!duplicated(tcga_clinical$Sample), ] |>
tcga_clinical.final rename(Sample_ID = Sample, Sex = Gender, OS_Time = OS.time, OS_Status = OS, DSS_Time = DSS.time, DSS_Status = DSS, DFS_Time = DFI.time, DFS_Status = DFI, PFS_Time = PFI.time, PFS_Status = PFI) |>
select(-cancer_type) |>
mutate(Sex = ifelse(Sex == "FEMALE", "F", "M"),
Patient_ID = substr(Sample_ID, 1, 12))
sum(is.na(tcga_clinical.final$Sample_ID))
sum(is.na(tcga_clinical.final$Cancer))
sum(duplicated(tcga_clinical.final$Sample_ID))
= tcga_clinical.final |>
tcga_clinical.final select(Patient_ID, Sample_ID, everything()) |>
as.data.table()
tcga_clinical.final
fwrite(tcga_clinical.final, file = "/home/data2/Projects/Fusion/fusiondb/Clininfo/TCGA_info.tsv", sep = "\t")
CPTAC
library(data.table)
# Clinical data from CPTAC paper ------------------------------------------
= fs::dir_ls("CPTAC_Clinical_meta_data_v1/")
cli_list
= purrr::map(cli_list, fread) |> rbindlist(fill = TRUE, use.names = TRUE)
cli colnames(cli)
# 问题,这个第 2 行要跳过才行
# 另外这个同时包含 CPTAC-2/3 的样本,但总数少一些
= purrr::map(cli_list, function(x) {
cli2 # 由于列名也不同,这里读两次,一次读 colname,一次读数据
= fread(x)
data1 = fread(x, skip = 2, header = FALSE)
data2 # 更新列名
colnames(data2) = colnames(data1)
$Cancer = stringr::str_remove(basename(x), "_meta.txt")
data2|> as.data.table()
data2 |> rbindlist(fill = TRUE, use.names = TRUE)
}) #colnames(cli2) = colnames(cli)
colnames(cli2)
== "Yes" & Normal == "Yes"] # 这个是标记患者是否同时 tumor / normal sample
cli2[Tumor
= fs::dir_ls("meta_data_BCM/")
cli_list2
= purrr::map(cli_list2, function(x) {
cli3 # 由于列名也不同,这里读两次,一次读 colname,一次读数据
= fread(x)
data1 = fread(x, skip = 2, header = FALSE)
data2 # 更新列名
colnames(data2) = colnames(data1)
$Cancer = stringr::str_remove(basename(x), "_meta.txt")
data2|> as.data.table()
data2 |> rbindlist(fill = TRUE, use.names = TRUE)
}) colnames(cli3)
all.equal(cli2, cli3)
all.equal(sort(colnames(cli2)), sort(colnames(cli3)))
= cli3[, colnames(cli2), with = FALSE]
cli4 all.equal(cli2, cli4)
identical(cli2, cli4)
::setdiff(cli2, cli4) |> View()
dplyr::setdiff(cli4, cli2) |> nrow()
dplyr
# rbind(cli2[idx == "01OV007"],
# cli4[idx == "01OV007"]) |> View()
# 2 个数据源目录的文件基本没有区别
= cli2 |>
cli.final ::rename(
dplyrPatient_ID = idx,
OS_Status = OS_event,
OS_Time = OS_days,
PFS_Status = PFS_event,
PFS_Time = PFS_days
|>
) ::mutate(
dplyrSample_ID = Patient_ID,
Sex = ifelse(Sex == "Female", "F", "M")
|>
) ::select(
dplyr::everything()
Patient_ID, Sample_ID, Cancer, Age, Sex, OS_Time, OS_Status, PFS_Time, PFS_Status, dplyr
)# dplyr::select(
# - dplyr::starts_with("CIBERSORT"), - dplyr::starts_with("xCell")
# )
fwrite(cli.final, file = "CPTAC_cli_paper_combined.tsv", sep = "\t")
# Clinical data from GDC --------------------------------------------------
system("
cd /home/data2/Projects/Fusion/unclean/gdc_cli/CPTAC
mkdir CPTAC-3 CPTAC-2
tar zxvf clinical.project-cptac-2.2024-08-07.tar.gz -C CPTAC-2
tar zxvf clinical.project-cptac-3.2024-08-07.tar.gz -C CPTAC-3
")
= fread("gdc_cli/CPTAC/CPTAC-2/clinical.tsv")
cptac2 = fread("gdc_cli/CPTAC/CPTAC-3/clinical.tsv")
cptac3
which(colnames(cptac2) %in% "residual_disease")
which(colnames(cptac3) %in% "residual_disease")
all.equal(cptac2[[119]], cptac2[[192]])
all.equal(cptac3[[119]], cptac3[[192]])
192]] = NULL
cptac2[[192]] = NULL
cptac3[[
setdiff(cptac2$case_submitter_id, cli.final$Sample_ID)
setdiff(cptac3$case_submitter_id, cli.final$Sample_ID)
# > setdiff(cptac2$case_submitter_id, cli.final$Sample_ID)
# [1] "01OV049" "26OV010" "100004028" "05BR058" "05BR055" "17OV019" "11BR069" "02OV042" "100004012" "05BR051" "03BR012" "02OV045" "04OV041"
# [14] "05BR052" "05BR031" "1488" "11OV009" "02OV040" "01OV002" "100002921" "22OV001" "13OV004" "100003304" "02OV035" "01OV046" "01OV045"
# [27] "17OV034"
# > setdiff(cptac3$case_submitter_id, cli.final$Sample_ID)
# [1] "C3N-02672" "C3L-01929" "C3N-02682" "C3N-02012" "C3N-02070"
# [6] "C3N-02696" "C3L-02354" "C3N-03026" "C3L-01739" "C3N-02088"
# [11] "C3L-04213" "C3N-03205" "C3L-04081" "C3N-03791" "C3L-03679"
# [16] "C3L-02643" "C3N-04686" "C3L-01558" "C3L-03268" "C3L-01633"
# [21] "C3L-02220" "C3N-02439" "C3N-02028" "C3N-03800" "C3L-04392"
# [26] "C3L-02556" "C3N-02721" "C3L-01672" "C3L-00938" "C3L-02201"
# [31] "C3N-02146" "C3N-02639" "C3N-03788" "C3L-03733" "C3L-04037"
# [36] "C3N-01091" "C3L-02747" "C3N-01879" "C3N-02996" "C3N-02296"
# [41] "C3L-02553" "C3L-03632" "C3N-03755" "GTEX-NPJ7-0011-R10A-SM-HAKXW" "C3L-02746"
length(setdiff(cptac3$case_submitter_id, cli.final$Sample_ID)) # 586
View(cptac2[case_submitter_id %in% setdiff(cptac2$case_submitter_id, cli.final$Sample_ID)])
View(cptac3[case_submitter_id %in% setdiff(cptac3$case_submitter_id, cli.final$Sample_ID)])
::fwrite(
data.table%in% setdiff(cptac3$case_submitter_id, cli.final$Sample_ID)],
cptac3[case_submitter_id file = "CPTAC3_non_included.tsv", sep = "\t"
)
cptac3
# 根据 GDC 已有信息补全 paper 缺少的样本信息 ---------------------------------------------
# 可能标记了 cancer type 的列
# primary_diagnosis site_of_resection_or_biopsy tissue_or_organ_of_origin
= rbind(cptac2, cptac3)[, .(case_submitter_id, primary_diagnosis, site_of_resection_or_biopsy, tissue_or_organ_of_origin)] |> unique()
cptac = dplyr::left_join(cptac, cli.final[, .(Sample_ID, Cancer)], by = c("case_submitter_id"="Sample_ID"))
cptac
setdiff(cli.final$Sample_ID, cptac$case_submitter_id)
# 有 29 例不在 GDC 数据中
table(cptac$Cancer, cptac$primary_diagnosis) |> pheatmap::pheatmap()
table(cptac$Cancer, cptac$site_of_resection_or_biopsy) |> pheatmap::pheatmap()
table(cptac$Cancer, cptac$tissue_or_organ_of_origin) |> pheatmap::pheatmap()
table(cptac$Cancer,
paste(
$primary_diagnosis,
cptac$site_of_resection_or_biopsy,
cptac$tissue_or_organ_of_origin, sep = "_"
cptac|> pheatmap::pheatmap(show_colnames = FALSE)
))
# 一个个处理
cat(sub("_meta.txt", "", basename(names(cli_list))))
= sub("_meta.txt", "", basename(names(cli_list)))
typelist # BRCA CCRCC COAD GBM HGSC HNSCC LSCC LUAD PDAC UCEC
for (i in typelist) {
message("check type ", i)
= cptac |> dplyr::filter(Cancer == i)
cptac_part message(" primary_diagnosis")
table(cptac_part$Cancer, cptac_part$primary_diagnosis) |> print()
message(" site_of_resection_or_biopsy")
table(cptac_part$Cancer, cptac_part$site_of_resection_or_biopsy) |> print()
message(" tissue_or_organ_of_origin")
table(cptac_part$Cancer, cptac_part$tissue_or_organ_of_origin) |> print()
}# BRCA (122): tissue_or_organ_of_origin "Breast, NOS"
# CCRCC (103): primary_diagnosis "Renal cell carcinoma, NOS"
# tissue_or_organ_of_origin "Kidney, NOS"
# COAD (106): tissue_or_organ_of_origin "Colon, NOS", "Rectum, NOS"
# GBM (99): primary_diagnosis "Glioblastoma"
# HGSC (87): primary_diagnosis "Serous adenocarcinoma, NOS" AND
# tissue_or_organ_of_origin Fallopian tube Ovary Peritoneum, NOS
# HNSCC (108): 其他情况 primary_diagnosis "Squamous cell carcinoma, NOS"
# Base of tongue, NOS Cheek mucosa Floor of mouth, NOS Gum, NOS
# HNSCC 1 2 18 3
#
# Head, face or neck, NOS Larynx, NOS Lip, NOS Oropharynx, NOS
# HNSCC 2 47 4 4
#
# Overlapping lesion of lip, oral cavity and pharynx Tongue, NOS Tonsil, NOS
# HNSCC
# LSCC (108): primary_diagnosis "Squamous cell carcinoma, NOS" AND
# tissue_or_organ_of_origin "lung" pattern
# LUAD (110): primary_diagnosis "Adenocarcinoma, NOS" AND
# tissue_or_organ_of_origin "lung" pattern
# PDAC (105): tissue_or_organ_of_origin "pancreas" pattern
# UCEC (95): primary_diagnosis "Endometrioid adenocarcinoma, NOS"
sum(!is.na(cptac$Cancer))
# rbind(cptac2, cptac3)[,192] # 有重复列,需要回到前面处理 (fixed)
# 实际测试和更新
= rbind(cptac2, cptac3) |>
cptac_updated unique() |>
::mutate(
dplyrCancer = dplyr::case_when(
%in% "Breast, NOS" ~ "BRCA",
tissue_or_organ_of_origin %in% "Renal cell carcinoma, NOS" ~ "CCRCC",
primary_diagnosis %in% c("Colon, NOS", "Rectum, NOS") ~ "COAD",
tissue_or_organ_of_origin grepl("Glioblastoma", primary_diagnosis, ignore.case = TRUE) | grepl("Oligodendroglioma", primary_diagnosis, ignore.case = TRUE) ~ "GBM",
grepl("GTEX", case_submitter_id, ignore.case = TRUE) ~ "GTEX-Brain",
%in% c("C3L-06912", "C3L-07212") ~ "GBM",
case_submitter_id #primary_diagnosis %in% "Glioblastoma" ~ "GBM",
%in% "Serous adenocarcinoma, NOS" & tissue_or_organ_of_origin %in% c("Fallopian tube", "Ovary", "Peritoneum, NOS") ~ "HGSC",
primary_diagnosis %in% "Squamous cell carcinoma, NOS" & grepl("lung", tissue_or_organ_of_origin, ignore.case = TRUE) ~ "LSCC",
primary_diagnosis %in% "Adenocarcinoma, NOS" & grepl("lung", tissue_or_organ_of_origin, ignore.case = TRUE) ~ "LUAD",
primary_diagnosis grepl("pancreas", tissue_or_organ_of_origin, ignore.case = TRUE) ~ "PDAC",
%in% "Endometrioid adenocarcinoma, NOS" ~ "UCEC",
primary_diagnosis TRUE ~ "HNSCC"
)|> as.data.table()
)
for (i in "HNSCC") { # typelist
message("check type ", i)
= cptac_updated |> dplyr::filter(Cancer == i)
cptac_part message(" primary_diagnosis")
table(cptac_part$Cancer, cptac_part$primary_diagnosis) |> print()
message(" site_of_resection_or_biopsy")
table(cptac_part$Cancer, cptac_part$site_of_resection_or_biopsy) |> print()
message(" tissue_or_organ_of_origin")
table(cptac_part$Cancer, cptac_part$tissue_or_organ_of_origin) |> print()
}
table(cptac$Cancer)
table(cptac_updated$Cancer)
|>
cptac_updated ::filter(
dplyr== "'--" | site_of_resection_or_biopsy == "'--" |
primary_diagnosis %in% c("'--", "Unknown")
tissue_or_organ_of_origin
)# 1-9 from GTEX-Brain
# C3L-06912 C3L-07212 Gliomas # https://portal.gdc.cancer.gov/cases/e6027623-a5b2-4d82-a9ca-f37c948dd3ed
$case_submitter_id[grepl("GTEX", cptac_updated$case_submitter_id)]
cptac_updated
nrow(cptac_updated)
fwrite(cptac_updated, file = "CPTAC_combined_with_cancer_annotated.tsv", sep = "\t")
= cptac_updated[! case_submitter_id %in% cli.final$Sample_ID]
cptac_unmatched colnames(cptac_unmatched)
colnames(cli.final)[1:20]
= cptac_unmatched |>
cptac_unmatched2 ::select(
dplyr
case_submitter_id, age_at_diagnosis, gender,
days_to_death, vital_status,::starts_with("ajcc"),
dplyr::starts_with("days"),
dplyr
last_known_disease_status, progression_or_recurrence, residual_disease,
tumor_grade, Cancer
)
sum(cptac_unmatched2$days_to_last_follow_up != "'--")
sum(cptac_unmatched2$days_to_death != "'--")
sum(cptac_unmatched2$days_to_recurrence != "'--")
sum(cptac_unmatched2$days_to_last_known_disease_status != "'--")
sum(cptac_unmatched2$progression_or_recurrence != "'--")
sum(!is.na(cli.final$OS_Time))
sum(!is.na(cli.final$PFS_Time))
sum(cli.final$OS_Time >= cli.final$PFS_Time, na.rm = TRUE)
$OS_Time < cli.final$PFS_Time] |> View()
cli.final[cli.final
sum(as.integer(cptac_unmatched2$days_to_last_follow_up) >= as.integer(cptac_unmatched2$days_to_last_known_disease_status), na.rm = TRUE)
= cptac_unmatched2 |>
cptac_unmatched3 ::rename(
dplyrSample_ID = case_submitter_id,
Age = age_at_diagnosis,
Sex = gender,
OS_Time = days_to_last_follow_up,
OS_Status = vital_status,
PFS_Time = days_to_last_known_disease_status,
PFS_Status = progression_or_recurrence,
Histologic_Grade = tumor_grade,
Path_Stage_pT = ajcc_pathologic_t,
Path_Stage_pN = ajcc_pathologic_n,
Stage = ajcc_pathologic_stage
|>
) ::select(
dplyr
Sample_ID, Age, Sex, OS_Time, OS_Status, PFS_Time, PFS_Status,:Path_Stage_pT,
Path_Stage_pN
Histologic_Grade, Cancer|>
) ::mutate(
dplyrPatient_ID = Sample_ID,
Age = as.integer(round(as.integer(Age) / 365)),
Sex = fcase(Sex %in% "female", "F",
%in% "male", "M", default = NA_character_),
Sex OS_Time = as.integer(OS_Time),
OS_Status = fcase(OS_Status %in% "Alive", 0L,
%in% "Dead", 1L, default = NA_integer_),
OS_Status PFS_Time = as.integer(PFS_Time),
PFS_Status = fcase(PFS_Status %in% "no", 0L,
%in% "yes", 1L, default = NA_integer_),
PFS_Status OS_Time = ifelse(OS_Time < 0, NA_integer_, OS_Time),
Path_Stage_pN = fcase(
%in% "N0", "pN0",
Path_Stage_pN %in% c("N1", "N1a"), "pN1",
Path_Stage_pN %in% c("N2", "N2a"), "pN2",
Path_Stage_pN default = NA_character_
),Path_Stage_pT = fcase(
%in% "T0", "pT0",
Path_Stage_pT %in% c("T1", "T1a", "T1b", "T1c"), "pT1",
Path_Stage_pT %in% c("T2", "T2a", "T2b"), "pT2",
Path_Stage_pT %in% c("T3", "T3a", "T3b"), "pT3",
Path_Stage_pT %in% "T4", "pT4",
Path_Stage_pT default = NA_character_
),Stage = fcase(
%in% c("Stage I", "Stage IA", "Stage IA1", "Stage IA2", "Stage IA3", "Stage IB"), "Stage I",
Stage %in% c("Stage II", "Stage IIA", "Stage IIB"), "Stage II",
Stage %in% c("Stage III", "Stage IIIA", "Stage IIIB", "Stage IIIC", "Stage IIIC1", "Stage IIIC2"), "Stage III",
Stage %in% c("Stage IV", "Stage IVB"), "Stage IV",
Stage default = NA_character_
),Histologic_Grade = fcase(
%in% "G1", "G1 Well differentiated",
Histologic_Grade %in% "G2", "G2 Moderately differentiated",
Histologic_Grade %in% "G3", "G3 Poorly differentiated",
Histologic_Grade %in% "G4", "G4 Undifferentiated",
Histologic_Grade default = NA_character_
)
)
cptac_unmatched3
# X: cannot be assessed
table(cptac_unmatched3$Path_Stage_pN)
table(cli.final$Path_Stage_pN)
table(cptac_unmatched3$Path_Stage_pT)
table(cli.final$Path_Stage_pT)
table(cptac_unmatched3$Stage)
table(cli.final$Stage)
table(cptac_unmatched3$Histologic_Grade)
table(cli.final$Histologic_Grade)
# Final output ------------------------------------------------------------
.2 = dplyr::bind_rows(
cli.final
cli.final, cptac_unmatched3|> as.data.table()
)
table(cli.final.2$Cancer)
sum(table(cli.final.2$Cancer))
# Race 全部加进去?
.2 = dplyr::left_join(
cli.final.2, unique(rbind(cptac2, cptac3)[, .(case_submitter_id, race)]),
cli.finalby = c("Sample_ID"="case_submitter_id")
|>
) ::mutate(
dplyrrace = ifelse(race %in% c("Unknown", "not reported"), NA_character_,
race)|>
) as.data.table()
sum(is.na(cli.final.2$race))
fwrite(cli.final.2, file = "/home/data2/Projects/Fusion/fusiondb/Clininfo/CPTAC_info.tsv", sep = "\t")
TARGET
library(data.table)
= fread("TARGET_donor_allprojects_transfer_to_sample.gz")
cli1 = fread("TARGET_phenotype.gz")
cli2
= cli2 |>
target_cli ::select(-`_cohort`) |>
dplyr::rename(Patient_ID = `_PATIENT`,
dplyrSample_ID = sample_id,
disease_code = primary_disease_code,
disease = `_primary_disease`,
sample_type = `_sample_type`) |>
::full_join(
dplyr|> dplyr::select(
cli1 -`_OS_UNIT`, -`_EVENT`, -`_TIME_TO_EVENT`
|>
) ::rename(
dplyrSample_ID = xena_sample,
Sex = `_gender`,
Age = `_age_at_diagnosis`,
OS_Status = `_OS_IND`,
OS_Time = `_OS`
by = "Sample_ID"
), |>
) ::mutate(
dplyrSex = ifelse(Sex == "Male", "M", "F"),
OS_Status = ifelse(OS_Status == "Alive", 0L, 1L)
|>
) ::select(Patient_ID, Sample_ID, Age, Sex, OS_Time, OS_Status, dplyr::everything()) |>
dplyras.data.table()
cli1
target_cli
fwrite(target_cli, file = "/home/data2/Projects/Fusion/fusiondb/Clininfo/TARGET_xena_info.tsv", sep = "\t")
# Using cli from GDC ------------------------------------------------------
system("
cd /home/data2/Projects/Fusion/unclean/gdc_cli/TARGET
mkdir ALL_P1 ALL_P2 ALL_P3 AML CCSK NBL OS RT WT
tar zxvf clinical.project-target-all-p1.2024-08-07.tar.gz -C ALL_P1
tar zxvf clinical.project-target-all-p2.2024-08-07.tar.gz -C ALL_P2
tar zxvf clinical.project-target-all-p3.2024-08-07.tar.gz -C ALL_P3
tar zxvf clinical.project-target-aml.2024-08-07.tar.gz -C AML
tar zxvf clinical.project-target-ccsk.2024-08-07.tar.gz -C CCSK
tar zxvf clinical.project-target-nbl.2024-08-07.tar.gz -C NBL
tar zxvf clinical.project-target-os.2024-08-07.tar.gz -C OS
tar zxvf clinical.project-target-rt.2024-08-07.tar.gz -C RT
tar zxvf clinical.project-target-wt.2024-08-07.tar.gz -C WT
")
= rbindlist(
gdc_target list(
fread("gdc_cli/TARGET/ALL_P1/clinical.tsv"),
fread("gdc_cli/TARGET/ALL_P2/clinical.tsv"),
fread("gdc_cli/TARGET/ALL_P3/clinical.tsv"),
fread("gdc_cli/TARGET/AML/clinical.tsv"),
fread("gdc_cli/TARGET/CCSK/clinical.tsv"),
fread("gdc_cli/TARGET/NBL/clinical.tsv"),
fread("gdc_cli/TARGET/OS/clinical.tsv"),
fread("gdc_cli/TARGET/RT/clinical.tsv"),
fread("gdc_cli/TARGET/WT/clinical.tsv")
),use.names = TRUE, fill = TRUE
)
table(gdc_target$project_id)
any(duplicated(gdc_target))
= gdc_target[!duplicated(gdc_target)]
gdc_target
all.equal(gdc_target[[119]], gdc_target[[192]])
192]] = NULL
gdc_target[[
sum(gdc_target$days_to_last_follow_up != "'--")
sum(gdc_target$days_to_death != "'--")
= gdc_target |>
gdc_target2 ::rename(
dplyrSample_ID = case_submitter_id,
Age = age_at_diagnosis,
Sex = gender,
OS_Time = days_to_death,
OS_Status = vital_status
|>
) ::select(
dplyr
Sample_ID, Age, Sex, OS_Time, OS_Status,
project_id, race, primary_diagnosis|>
) ::mutate(
dplyrPatient_ID = Sample_ID,
Age = as.integer(round(as.integer(Age) / 365)),
Sex = fcase(Sex %in% "female", "F",
%in% "male", "M", default = NA_character_),
Sex OS_Time = as.integer(OS_Time),
OS_Status = fcase(OS_Status %in% "Alive", 0L,
%in% "Dead", 1L, default = NA_integer_),
OS_Status OS_Time = ifelse(OS_Time < 0, NA_integer_, OS_Time)
|>
) as.data.table()
#unique()
# 存在大量 case level 重复
duplicated(gdc_target2), ]
gdc_target2[duplicated(gdc_target[case_submitter_id == "TARGET-20-PANTIV"])
== "TARGET-20-PANTIV"] |> View() # 存在少量记录的更新?
gdc_target[case_submitter_id
= gdc_target2[!duplicated(gdc_target2)]
gdc_target2
#table(gdc_target2$PFS_Time)
any(duplicated(gdc_target2))
any(duplicated(gdc_target2$Sample_ID))
duplicated(gdc_target2$Sample_ID)]
gdc_target2[# 会存在不同 ALL 的 Project 中
= gdc_target2[Sample_ID %in% Sample_ID[duplicated(Sample_ID)]]
gdc_dup table(gdc_dup$project_id)
# TARGET-ALL-P1 TARGET-ALL-P2 TARGET-ALL-P3
# 10 57 47
#
# 这种先不处理,如果 ALL 合并分析考虑去重
colnames(gdc_target2)
= gdc_target2[, c("Patient_ID", "Sample_ID", "Age", "Sex", "race", "primary_diagnosis", "OS_Time", "OS_Status", "project_id")]
gdc_target2
fwrite(gdc_target2, file = "/home/data2/Projects/Fusion/fusiondb/Clininfo/TARGET_info.tsv", sep = "\t")
sum(!is.na(gdc_target2$OS_Time))
sum(!is.na(cli1$`_OS`))
nrow(na.omit(unique(target_cli[, .(Patient_ID, OS_Time, OS_Status)])))
nrow(na.omit(unique(gdc_target2[, .(Patient_ID, OS_Time, OS_Status)])))