library(dplyr)
library(DT)
options(stringsAsFactors = FALSE)
Read and map LabeledIn
# LabeledIn indications: load the TSV, then map both identifier columns.
labin.df <- read.delim(file.path('labeledin', 'data', 'indications.tsv'))
# Remove combo drugs: their rxnorm_id does not parse as an integer, so the
# coercion turns it into NA (with a warning) and the filter drops the row.
labin.df <- dplyr::mutate(labin.df, rxnorm_id = as.integer(rxnorm_id))
labin.df <- dplyr::filter(labin.df, !is.na(rxnorm_id))
# Map UMLS diseases to DO (natural join on the shared columns).
labin.df <- dplyr::inner_join(labin.df, umls.df)
# Map RxNorm compounds to DrugBank (natural join on the shared columns).
labin.df <- dplyr::inner_join(labin.df, rxnorm.df)
## Warning in mutate_impl(.data, dots): NAs introduced by coercion
## Joining by: "disease_cui"
## Joining by: "rxnorm_id"
Read and map MEDI
# MEDI indications: load the TSV and keep only rows whose compound has a
# match in rxnorm.df (natural inner join on the shared columns).
medi.df <- read.delim(file.path('medi', 'data', 'medi-umls.tsv'))
medi.df <- dplyr::inner_join(medi.df, rxnorm.df)
## Joining by: "rxnorm_id"
# Map MEDI diseases through two independent vocabularies and stack the
# results: first the rows matched through umls.df, then those matched
# through icd9.df. Both joins read the medi.df built by the previous step.
medi.df <- dplyr::bind_rows(
  dplyr::inner_join(umls.df, medi.df),
  dplyr::inner_join(icd9.df, medi.df)
)
## Joining by: "disease_cui"
## Joining by: "disease_icd9"
Read and map PREDICT
# PREDICT indications: load the TSV and rename the disease identifier
# columns to disease_cui / disease_omim for the joins that follow.
predict.df <- read.delim(
  file.path('msb-predict', 'data', 'indications-umls.tsv')
)
predict.df <- dplyr::rename(
  predict.df,
  disease_cui = umls_cui,
  disease_omim = omim_id
)
# Map PREDICT diseases through two independent vocabularies and stack the
# results: first the rows matched through umls.df, then those matched
# through omim.df. Both joins read the predict.df built by the previous step.
predict.df <- dplyr::bind_rows(
  dplyr::inner_join(umls.df, predict.df),
  dplyr::inner_join(omim.df, predict.df)
)
## Joining by: "disease_cui"
## Joining by: "disease_omim"
Read and map ehrlink
# ehrlink indications: load the TSV.
ehrlink.df <- read.delim(file.path('ehrlink', 'data', 'indications.tsv'))
# The file's own DO columns become the "subsumed" term, freeing the
# doid_code / doid_name names for the slim term joined in next.
ehrlink.df <- dplyr::rename(
  ehrlink.df,
  subsumed_id = doid_code,
  subsumed_name = doid_name
)
# Replace each subsumed term by its slim term from doslim.df.
ehrlink.df <- dplyr::inner_join(
  ehrlink.df,
  dplyr::transmute(
    doslim.df,
    subsumed_id,
    doid_code = slim_id,
    doid_name = slim_name
  )
)
# Map compounds via rxnorm.df, whose rxnorm_id column is renamed to
# ingredient_rxcui for this join.
ehrlink.df <- dplyr::inner_join(
  ehrlink.df,
  dplyr::rename(rxnorm.df, ingredient_rxcui = rxnorm_id)
)
## Joining by: "subsumed_id"
## Joining by: "ingredient_rxcui"
Join resources
# Stack the four mapped resources into one long table with the columns
# doid_code, drugbank_id and resource. Each resource contributes at most one
# row per (doid_code, drugbank_id) pair.
indication.df <- dplyr::bind_rows(
  # LabeledIn
  labin.df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'labeledin'),
  # MEDI: a pair is labelled 'medi_hps' when any of its rows has a positive
  # hps value, otherwise 'medi_lps'.
  # NOTE(review): assumes hps is a 0/1 (or logical) flag -- confirm against
  # the MEDI data file.
  # A scalar if/else is used instead of ifelse(): the condition is a single
  # value per group, and na.rm = TRUE keeps a missing hps in one row from
  # turning the whole pair's resource into NA.
  medi.df %>%
    dplyr::group_by(doid_code, drugbank_id) %>%
    dplyr::summarize(
      resource = if (any(hps > 0, na.rm = TRUE)) 'medi_hps' else 'medi_lps'
    ) %>%
    dplyr::ungroup(),
  # PREDICT
  predict.df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'predict'),
  # ehrlink
  ehrlink.df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'ehrlink')
)
# Add compound and disease names.
# Inner join: indications whose compound has no match in drugbank.df are
# dropped.
indication.df <- dplyr::inner_join(indication.df, drugbank.df)
# Left join: indications are kept even when do.df has no matching disease.
indication.df <- dplyr::left_join(indication.df, do.df)
## Joining by: "drugbank_id"
## Joining by: "doid_code"
# Save the sourced indications as a tab-separated file (no row names, no
# quoting), then render them as an interactive table.
path <- file.path('data', 'indications-with-source.tsv')
write.table(indication.df, path, sep = '\t', row.names = FALSE, quote = FALSE)
# FALSE is spelled out: F is an ordinary variable that can be reassigned.
indication.df %>% DT::datatable(rownames = FALSE)
Convert to a single row per compound-disease pair
# Resources treated as high-confidence and low-confidence evidence.
hc.sources <- c('medi_hps', 'ehrlink', 'labeledin', 'predict')
lc.sources <- c('medi_lps')

# Collapse to one row per compound-disease pair, counting how many distinct
# high- and low-confidence resources report it. A pair is 'high' confidence
# when at least one high-confidence resource reports it, otherwise 'low'.
indication_no_source.df <- indication.df %>%
  dplyr::group_by(doid_code, drugbank_id, doid_name, drugbank_name) %>%
  dplyr::summarize(
    n_hc_resources = length(intersect(resource, hc.sources)),
    n_lc_resources = length(intersect(resource, lc.sources))
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(confidence = ifelse(n_hc_resources > 0, 'high', 'low'))

# Save as a tab-separated file (no row names, no quoting) and display.
path <- file.path('data', 'indications.tsv')
write.table(
  indication_no_source.df, path,
  sep = '\t', row.names = FALSE, quote = FALSE
)
# FALSE is spelled out: F is an ordinary variable that can be reassigned.
indication_no_source.df %>% DT::datatable(rownames = FALSE)
Indications per disease
# Per disease: number of high-confidence and low-confidence indications.
indication_no_source.df %>%
  dplyr::group_by(doid_code, doid_name) %>%
  dplyr::summarize(
    n_hc = sum(confidence == 'high'),
    n_lc = sum(confidence == 'low')
  ) %>%
  # drop the grouping that summarize() leaves behind before display
  dplyr::ungroup() %>%
  # FALSE is spelled out: F is an ordinary variable that can be reassigned.
  DT::datatable(rownames = FALSE)
Indications per compound
# Per compound: number of high-confidence and low-confidence indications.
indication_no_source.df %>%
  dplyr::group_by(drugbank_id, drugbank_name) %>%
  dplyr::summarize(
    n_hc = sum(confidence == 'high'),
    n_lc = sum(confidence == 'low')
  ) %>%
  # drop the grouping that summarize() leaves behind before display
  dplyr::ungroup() %>%
  # FALSE is spelled out: F is an ordinary variable that can be reassigned.
  DT::datatable(rownames = FALSE)