import pandas as pd
import numpy as np
import re
from bokeh.plotting import *
output_notebook()
def plen(x): print(len(x))
def format_column(c):
    return c.replace(' ', '_').lower()
df = pd.read_csv('salaries.csv')
df.columns = [format_column(c) for c in df.columns]
df.head()
def create_standard_location(df):
    
    def standardize(loc):
        if pd.isnull(loc):
            return ''
        return loc.split(',')[0].lower().strip()
    
    df['std_location'] = df.location.map(standardize)
    
    return df
df = create_standard_location(df)
def create_clean_salaries(df):
    def clean_individual_salary(x):    
        """
        Clean the salary data
        1. reject some malformed/difficult to parse data
        2. try to detect currency and return the type
        """    
        final_salary = np.nan
        salary_type = np.nan
        if pd.notnull(x):        
            salary_type = 'USD' # default
            x = x.lower()        
            x = re.sub("[\s,]", "", x)    
            x = re.sub("\.[0-9]*", "", x)
            if '€' in x or 'eur' in x:
                salary_type = 'Euro'
                x = re.sub(r'[€(eur)(euro)]', '', x)
            if '$' in x or 'usd' in x:            
                x = re.sub(r'[\$(usd)]', '', x)
            if '£' in x or 'gbp' in x:
                salary_type = 'GBP'
                x = x.replace('[£(gbp)]', '')                                
            try:
                final_salary = int(x)        
            except:
                #print("convert to numeric failure: '{}'".format(x))
                final_salary = np.nan
                salary_type = np.nan
        return (final_salary, salary_type)
    clean_salary_ufunc = np.frompyfunc(clean_individual_salary, 1, 2)
    final_salary, salary_type = clean_salary_ufunc(df.annual_base_pay)
    df['salary'] = final_salary
    df['salary_type'] = salary_type
    return df
def drop_unreasonable_salaries(df):    
    reasonable = df['salary'].map(lambda x: x > 20000 and x < 360000)
    print("resonable salaries:")
    print(reasonable.value_counts())    
    df = df[reasonable]
    return df
df = create_clean_salaries(df)
plen(df)
df = drop_unreasonable_salaries(df)
plen(df)
df.loc
def create_sex(df):
    
    def convert_gender(x):
        if pd.isnull(x):
            return x
        elif 'female' in x.lower():
            return 'F'
        elif 'male' in x.lower():
            return 'M'
        else:
            return 'other'
        
    df['sex'] = df.gender.map(convert_gender)
    return df
df = create_sex(df)
df = df.dropna(subset='sex'.split())
df.sex.value_counts()
cols = 'location job_title salary years_at_employer years_of_experience sex'.split()
df = df.sort_values('salary', ascending=False)
def drop_unreasonable_years(df):
    plen(df)
    df = df.dropna(subset='salary years_job years_experience'.split())
    df = df[df.years_experience > 0]
    df = df[df.years_experience < 50]
    df = df[df.years_experience < 50]
    df = df[df.years_job >= 0]
    df = df[df.years_job <= 50]
    plen(df)
    return df
df['years_job'] = pd.to_numeric(df.years_at_employer, errors='coerce')
df['years_experience'] = pd.to_numeric(df.years_of_experience, errors='coerce')
df = drop_unreasonable_years(df)
df[cols].head(5)
df[cols].tail(5)
df = df[df.salary_type == 'USD']
loc = df.location.fillna('')
sv_set = {'bay area', 'san jose', 'san francisco', 'mountain view'}
is_silicon = loc.str.lower().isin(sv_set)
sv_df = df[is_silicon]
other_df = df[~is_silicon]  
print("silicon valley:", len(sv_df))
print("other:", len(other_df))
def show_salary_info():    
    dlist = []
    def add(df, name):
        mean = df.salary.mean()
        median = df.salary.median()
        quants = list(df.salary.quantile([0.01, 0.1, 0.25, 0.75, 0.9, 0.99]))
        d = {
            'name' : name,
            'mean' : mean,
            'median': median,
            '1%' : quants[0],
            '10%' : quants[1],
            '25%' : quants[2],
            '75%' : quants[3],
            '90%' : quants[4],
            '99%' : quants[5],
        }
        dlist.append(d)
        
    sv_male_df = sv_df[sv_df.sex == 'M']
    sv_female_df = sv_df[sv_df.sex == 'F']
    other_male_df = other_df[other_df.sex == 'M']
    other_female_df = other_df[other_df.sex == 'F']
        
    add(sv_df, 'Silicon Valley')
    add(other_df, 'Non-Silicon Valley')
    add(sv_male_df, 'SV_Males')
    add(other_male_df, 'Non_SV_Males')
    add(sv_female_df, 'SV_Females')    
    add(other_female_df, 'Non_SV_Females')
    df = pd.DataFrame(dlist)
    df = df['name mean median 1% 10% 25% 75% 90% 99%'.split()]
    for c in ['mean', 'median'] + [c for c in df.columns if '%' in c]:
        df[c] = df[c].astype(int)        
    s = df.style
    s.background_gradient(cmap='coolwarm')    
        
    return s
# print("SV Salary Quantiles:")
# print(sv_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
# print("Non_SV Salary Quantiles:")
# print(other_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))
show_salary_info()
F_ALPHA = 0.3
def graph_salary_vs_experience():
    p = figure(title="Salary vs Years of Experience", y_axis_label='Salary (1k)', x_axis_label='Years Experience')                      
    x = sv_df.years_experience
    y = sv_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_experience
    y = other_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_experience()
def graph_salary_vs_years_at_job():
    p = figure(title="Salary vs Years at Job", y_axis_label='Salary (1k)', x_axis_label='Years at Job')                      
    x = sv_df.years_job
    y = sv_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_job
    y = other_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_years_at_job()
def graph_salary_vs_gender(df):
    
    male = df[df.sex == 'M']
    female = df[df.sex == 'F']
    other = df[df.sex == 'other']
    
    p = figure(title="Salary vs Years Experience (by Gender)", y_axis_label='Salary (1k)', x_axis_label='Years Experience')                      
    x = male.years_experience
    y = male.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Male')
    
    x = female.years_experience
    y = female.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='blue', legend='Female')
    
    x = other.years_experience
    y = other.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Other')
    
    show(p)
    
graph_salary_vs_gender(df)
def regional_salaries():
    res_list = []
    for name, grp in df.groupby('std_location'):
        if len(grp) < 8 or name == '':
            continue
        res = (name, grp.salary.mean() / 1000)
        res_list.append(res)
    print(res_list)
    
    p = figure()
    p.plot_width = 1000
    p.plot_height = 800
    
    def sort_func(tup):
        return tup[1]
    
    res_list.sort(key=sort_func)
    
    x = [i for i in range(len(res_list))]
    y = [r[1] for r in res_list]
    
    p.scatter(x, y)
    texts = [r[0] for r in res_list]
    p.text(x, y, texts, angle=-0.8)
    
    for name, mean in res_list:
        pass
        
    show(p)
    
regional_salaries()