library(readr)
library(dplyr)
library(DT)
Read and map LabeledIn
# Build labin_df: the LabeledIn indications joined onto the disease and
# compound mapping tables. umls_df and rxnorm_df are not created in this
# section; they must already exist in the session.
labin_df <-
  # read labeledin data, keeping rxnorm_id as text so the conversion below
  # (not the parser) decides which identifiers survive
  file.path('labeledin', 'data', 'indications.tsv') %>%
  readr::read_tsv(col_types = list(rxnorm_id = readr::col_character())) %>%
  # removes combo drugs which do not convert to integer: as.integer() turns
  # them into NA (emitting a coercion warning) and the filter drops them
  dplyr::mutate(rxnorm_id = as.integer(rxnorm_id)) %>%
  dplyr::filter(!is.na(rxnorm_id)) %>%
  # map umls diseases to DO; the key is named explicitly so that a column
  # that merely shares a name can never silently become part of the join
  dplyr::inner_join(umls_df, by = 'disease_cui') %>%
  # map rxnorm compounds to drugbank
  dplyr::inner_join(rxnorm_df, by = 'rxnorm_id')
## Warning in mutate_impl(.data, dots): NAs introduced by coercion
Read and map MEDI
# Read the MEDI table and keep only rows whose rxnorm_id is present in
# rxnorm_df (defined outside this section).
medi_df <-
  file.path('medi', 'data', 'medi-umls.tsv') %>%
  readr::read_tsv() %>%
  dplyr::inner_join(rxnorm_df, by = 'rxnorm_id')

# Map MEDI diseases through two independent routes and stack the results:
# one match on disease_cui via umls_df, one match on disease_icd9 via icd9_df.
# A MEDI row that matches through both routes appears in both halves.
# Join keys are explicit so the match cannot widen if the tables later gain
# another identically named column.
medi_df <- dplyr::bind_rows(
  umls_df %>%
    dplyr::inner_join(medi_df, by = 'disease_cui'),
  icd9_df %>%
    dplyr::inner_join(medi_df, by = 'disease_icd9')
)
Read and map PREDICT
# Read the PREDICT indications and rename its disease identifier columns to
# the names used by the mapping tables (umls_cui -> disease_cui,
# omim_id -> disease_omim).
predict_df <-
  file.path('msb-predict', 'data', 'indications-umls.tsv') %>%
  readr::read_tsv() %>%
  dplyr::rename(disease_cui = umls_cui, disease_omim = omim_id)

# Map PREDICT diseases through two routes and stack the results: a match on
# disease_cui via umls_df and a match on disease_omim via omim_df (both
# defined outside this section). Keys are explicit so the joins stay stable
# if either side gains additional shared column names.
predict_df <- dplyr::bind_rows(
  umls_df %>%
    dplyr::inner_join(predict_df, by = 'disease_cui'),
  omim_df %>%
    dplyr::inner_join(predict_df, by = 'disease_omim')
)
Read and map ehrlink
# Read the ehrlink indications, rename ingredient_rxcui to rxnorm_id so it
# lines up with rxnorm_df (defined outside this section), and keep only the
# rows whose rxnorm_id is found there.
ehrlink_df <-
  file.path('ehrlink', 'data', 'indications.tsv') %>%
  readr::read_tsv() %>%
  dplyr::rename(rxnorm_id = ingredient_rxcui) %>%
  # explicit key: only rxnorm_id may take part in the match
  dplyr::inner_join(rxnorm_df, by = 'rxnorm_id')
# ehrlink.df <- ehrlink.df %>%
# dplyr::rename(subsumed_id = doid_code, subsumed_name = doid_name) %>%
# dplyr::inner_join(
# doslim.df %>% dplyr::transmute(subsumed_id, doid_code = slim_id, doid_name = slim_name)
# ) %>%
# dplyr::inner_join(rxnorm.df %>% dplyr::rename(ingredient_rxcui = rxnorm_id))
Join resources
# Stack the four resources into one long table with one row per
# (doid_code, drugbank_id, resource) combination.
indication_df <- dplyr::bind_rows(
  # LabeledIn
  labin_df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'labeledin'),
  # MEDI: one row per pair, labelled 'medi_hps' when the largest hps value in
  # the group is non-zero and 'medi_lps' otherwise. An NA maximum yields an
  # NA resource, exactly as the bare numeric test did.
  medi_df %>%
    dplyr::group_by(doid_code, drugbank_id) %>%
    dplyr::summarize(
      resource = ifelse(max(hps) != 0, 'medi_hps', 'medi_lps')
    ) %>%
    dplyr::ungroup(),
  # PREDICT
  predict_df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'predict'),
  # ehrlink
  ehrlink_df %>%
    dplyr::select(doid_code, drugbank_id) %>%
    dplyr::distinct() %>%
    dplyr::mutate(resource = 'ehrlink')
) %>%
  dplyr::arrange(doid_code, drugbank_id, resource)

# add compound and disease names
# drugbank_df and do_df are defined outside this section. The inner join
# drops pairs whose drugbank_id is absent from drugbank_df; the left join
# keeps pairs even when do_df has no entry for the doid_code.
indication_df <- indication_df %>%
  dplyr::inner_join(drugbank_df, by = 'drugbank_id') %>%
  dplyr::left_join(do_df, by = 'doid_code')
# save the full (non-slim) indication table, then render it as an
# interactive table without the row-name column
indication_df %>% readr::write_tsv(file.path('data', 'indications-verbose.tsv'))
# FALSE rather than F: F is an ordinary variable that can be reassigned
indication_df %>% DT::datatable(rownames = FALSE)
# Restrict the indications to the slim compound and disease sets and attach
# the slim disease term for each (subsumed) disease. drugbank_slim, do_slim
# and doslim_map_df are defined outside this section.
indication_slim_df <- indication_df %>%
  dplyr::filter(drugbank_id %in% drugbank_slim) %>%
  dplyr::filter(doid_code %in% do_slim) %>%
  # the original disease becomes the "subsumed" term ...
  dplyr::rename(do_subsumed_id = doid_code, do_subsumed_name = doid_name) %>%
  # ... and is matched to its slim term; rows without a mapping are dropped.
  # A subsumed term that maps to several slim terms yields one row per
  # mapping.
  dplyr::inner_join(
    doslim_map_df %>%
      dplyr::transmute(
        do_slim_id = slim_id,
        do_slim_name = slim_name,
        do_subsumed_id = subsumed_id
      ),
    by = 'do_subsumed_id'
  ) %>%
  dplyr::select(
    drugbank_id, drugbank_name,
    do_slim_id, do_slim_name,
    do_subsumed_id, do_subsumed_name,
    resource
  ) %>%
  dplyr::arrange(drugbank_id, do_slim_id, do_subsumed_id, resource)

# save sourced indications
indication_slim_df %>% readr::write_tsv(file.path('data', 'indications-slim-verbose.tsv'))
Convert to a single row per compound-disease pair, including only high-confidence indications.
# Resources treated as high confidence; 'medi_lps' is the only resource
# label produced above that is not listed here.
hc_sources <- c('medi_hps', 'ehrlink', 'labeledin', 'predict')

# Collapse to one row per (compound, slim disease) pair, counting how many
# distinct resources support it and how many supporting rows there are in
# total (several subsumed diseases can contribute to one slim disease).
collapsed_df <- indication_slim_df %>%
  dplyr::filter(resource %in% hc_sources) %>%
  dplyr::rename(doid_id = do_slim_id, doid_name = do_slim_name) %>%
  dplyr::group_by(drugbank_id, drugbank_name, doid_id, doid_name) %>%
  dplyr::summarize(
    distinct_resources = n_distinct(resource),
    total_resources = n()
  ) %>%
  dplyr::ungroup()

# save
collapsed_df %>% readr::write_tsv(file.path('data', 'indications-slim-collapsed.tsv'))
# FALSE rather than F: F is an ordinary variable that can be reassigned
collapsed_df %>% DT::datatable(rownames = FALSE)
Indications per disease
# Count the collapsed indications for each disease and show the counts as an
# interactive table without the row-name column.
collapsed_df %>%
  dplyr::group_by(doid_id, doid_name) %>%
  dplyr::summarize(
    n_indications = n()
  ) %>%
  # summarize() only peels off the last grouping level; drop the rest so a
  # plain table is handed to the widget
  dplyr::ungroup() %>%
  DT::datatable(rownames = FALSE)
Indications per compound
# Count the collapsed indications for each compound and show the counts as
# an interactive table without the row-name column.
collapsed_df %>%
  dplyr::group_by(drugbank_id, drugbank_name) %>%
  dplyr::summarize(
    n_indications = n()
  ) %>%
  # summarize() only peels off the last grouping level; drop the rest so a
  # plain table is handed to the widget
  dplyr::ungroup() %>%
  DT::datatable(rownames = FALSE)
Create a dataset for the curators
# Curation sheet: one row per collapsed compound-disease pair, with empty
# classification and notes columns and a lookup URL for the compound and
# for the disease. The first ':' in doid_id is replaced by '%3A' in the
# disease URL. Rows are ordered by drug name, then disease name.
curation_df <- dplyr::arrange(
  dplyr::transmute(
    collapsed_df,
    drug = drugbank_name,
    disease = doid_name,
    classification = '',
    notes = '',
    drug_url = paste0('http://www.drugbank.ca/drugs/', drugbank_id),
    disease_url = paste0(
      'http://www.disease-ontology.org/term/',
      sub(':', '%3A', doid_id)
    )
  ),
  drug, disease
)
readr::write_tsv(curation_df, file.path('data', 'curation.tsv'))
# Write a reproducible random subset of the curation sheet (seed fixed at 0).
# At most 50 rows are drawn; the size is capped at the number of available
# rows because sample_n() without replacement fails when asked for more rows
# than the table holds.
set.seed(0)
curation_df %>%
  dplyr::sample_n(min(50, nrow(.))) %>%
  dplyr::arrange(drug, disease) %>%
  readr::write_tsv(file.path('data', 'curation-subset.tsv'))