library(dplyr)
library(DT)
library(ggplot2)

# Keep strings as character vectors (not factors) when data frames are read or
# built; this is already the default from R 4.0 onwards.
options(stringsAsFactors = FALSE)

# Symptoms from the HSDN (Human symptoms–disease network)

# Load the Disease Ontology cross-reference table and keep only the rows whose
# resource is MeSH ("MSH"). resource_id is renamed to disease_id and the
# now-constant resource column is dropped.
# Alternative input: 'http://git.dhimmel.com/disease-ontology/data/xrefs-prop-slim.tsv'
mesh.df <- read.delim(
  "http://git.dhimmel.com/disease-ontology/data/xrefs-slim.tsv"
) %>%
  dplyr::filter(resource == "MSH") %>%
  dplyr::rename(disease_id = resource_id) %>%
  dplyr::select(-resource)

# Load the HSDN supplement (already mapped to MeSH). The first column of the
# file supplies the row names, and the original headers are kept verbatim
# (check.names = FALSE) so they can be renamed to snake_case right away.
hsdn.df <- read.delim(
  "https://raw.githubusercontent.com/LABrueggs/HSDN/master/Combined-Output.tsv",
  check.names = FALSE,
  row.names = 1
) %>%
  dplyr::rename(
    disease_id = `MeSH Disease ID`,
    disease_name = `MeSH Disease Term`,
    symptom_id = `MeSH Symptom ID`,
    symptom_name = `MeSH Symptom Term`,
    cooccurs = `PubMed occurrence`,
    tfidf_score = `TFIDF score`
  )

# Attach the Disease Ontology (DO) columns from mesh.df to the HSDN rows by
# matching on disease_id. The inner join keeps only HSDN rows whose disease_id
# has a DO mapping. The key is named explicitly so the join cannot silently
# pick up any other column the two tables might come to share.
hsdn.df <- hsdn.df %>%
  dplyr::inner_join(mesh.df, by = "disease_id")

# Save the DO-annotated table as a tab-separated file. The output directory is
# created first so the write does not fail when 'data' does not exist yet.
path <- file.path('data', 'symptoms-DO.tsv')
dir.create(dirname(path), showWarnings = FALSE, recursive = TRUE)
write.table(hsdn.df, path, sep='\t', row.names=FALSE, quote=FALSE)

# Interactive table of the rows with a tfidf_score of at least 25
hsdn.df %>%
  dplyr::filter(tfidf_score >= 25) %>%
  DT::datatable()

# Histogram of tfidf_score across all rows, drawn on a log10 x-axis
ggplot(hsdn.df, aes(x = tfidf_score)) +
  geom_histogram(alpha = 0.6) +
  scale_x_log10() +
  theme_bw()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Histogram of tfidf_score restricted to the 0-50 range, one unit per bin.
# xlim() sets scale limits, so scores outside 0-50 are dropped from the plot
# data rather than merely hidden.
ggplot(hsdn.df, aes(x = tfidf_score)) +
  geom_histogram(binwidth = 1, alpha = 0.6) +
  theme_bw() +
  xlim(0, 50)

# Violin plot of tfidf_score (log10 scale) for each DO slim disease; the
# coordinates are flipped so the disease names run along the vertical axis.
ggplot(hsdn.df, aes(x = doid_name, y = tfidf_score)) +
  geom_violin(color = NA, fill = "darkgreen") +
  # geom_jitter(alpha = 0.4) +
  scale_y_log10() +
  coord_flip() +
  theme_bw()

# Per DO slim disease: total number of symptom rows, how many of them have
# tfidf_score >= 25, and what percentage that is (rounded to 3 decimals).
# The result is shown as an interactive table.
hsdn.df %>%
  dplyr::group_by(doid_code, doid_name) %>%
  dplyr::summarize(
    n_symptoms = n(),
    number_above_25 = sum(tfidf_score >= 25),
    percent_above_25 = round(100 * mean(tfidf_score >= 25), 3)
  ) %>%
  # summarize() only peels off the last grouping level, so drop the remaining
  # doid_code grouping before handing a plain table to the widget.
  dplyr::ungroup() %>%
  DT::datatable()