import pandas as pd
import numpy as np
import re
from bokeh.plotting import *
output_notebook()

def plen(x): print(len(x))

def format_column(c):
    return c.replace(' ', '_').lower()

df = pd.read_csv('salaries.csv')
df.columns = [format_column(c) for c in df.columns]
df.head()

def create_standard_location(df):
    
    def standardize(loc):
        if pd.isnull(loc):
            return ''
        return loc.split(',')[0].lower().strip()
    
    df['std_location'] = df.location.map(standardize)
    
    return df

df = create_standard_location(df)

def create_clean_salaries(df):

    def clean_individual_salary(x):    
        """
        Clean the salary data
        1. reject some malformed/difficult to parse data
        2. try to detect currency and return the type
        """    
        final_salary = np.nan
        salary_type = np.nan
        if pd.notnull(x):        
            salary_type = 'USD' # default
            x = x.lower()        
            x = re.sub("[\s,]", "", x)    
            x = re.sub("\.[0-9]*", "", x)
            if '€' in x or 'eur' in x:
                salary_type = 'Euro'
                x = re.sub(r'[€(eur)(euro)]', '', x)
            if '$' in x or 'usd' in x:            
                x = re.sub(r'[\$(usd)]', '', x)
            if '£' in x or 'gbp' in x:
                salary_type = 'GBP'
                x = x.replace('[£(gbp)]', '')                                
            try:
                final_salary = int(x)        

            except:
                #print("convert to numeric failure: '{}'".format(x))
                final_salary = np.nan
                salary_type = np.nan
        return (final_salary, salary_type)

    clean_salary_ufunc = np.frompyfunc(clean_individual_salary, 1, 2)
    final_salary, salary_type = clean_salary_ufunc(df.annual_base_pay)
    df['salary'] = final_salary
    df['salary_type'] = salary_type
    return df


def drop_unreasonable_salaries(df):    
    reasonable = df['salary'].map(lambda x: x > 20000 and x < 360000)
    print("resonable salaries:")
    print(reasonable.value_counts())    
    df = df[reasonable]
    return df

df = create_clean_salaries(df)
plen(df)
df = drop_unreasonable_salaries(df)
plen(df)

2187
resonable salaries:
True     1934
False     253
Name: salary, dtype: int64
1934

df.loc

<pandas.core.indexing._LocIndexer at 0x7f919088ca58>

def create_sex(df):
    
    def convert_gender(x):
        if pd.isnull(x):
            return x
        elif 'female' in x.lower():
            return 'F'
        elif 'male' in x.lower():
            return 'M'
        else:
            return 'other'
        
    df['sex'] = df.gender.map(convert_gender)
    return df

df = create_sex(df)
df = df.dropna(subset='sex'.split())
df.sex.value_counts()

/home/jrenner/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

M        1790
F          66
other      10
Name: sex, dtype: int64

cols = 'location job_title salary years_at_employer years_of_experience sex'.split()
df = df.sort_values('salary', ascending=False)

def drop_unreasonable_years(df):
    plen(df)
    df = df.dropna(subset='salary years_job years_experience'.split())
    df = df[df.years_experience > 0]
    df = df[df.years_experience < 50]
    df = df[df.years_experience < 50]
    df = df[df.years_job >= 0]
    df = df[df.years_job <= 50]
    plen(df)
    return df

df['years_job'] = pd.to_numeric(df.years_at_employer, errors='coerce')
df['years_experience'] = pd.to_numeric(df.years_of_experience, errors='coerce')
df = drop_unreasonable_years(df)

1866
1668

df[cols].head(5)

df[cols].tail(5)

df = df[df.salary_type == 'USD']
loc = df.location.fillna('')
sv_set = {'bay area', 'san jose', 'san francisco', 'mountain view'}
is_silicon = loc.str.lower().isin(sv_set)
sv_df = df[is_silicon]
other_df = df[~is_silicon]  

print("silicon valley:", len(sv_df))
print("other:", len(other_df))

silicon valley: 158
other: 1473

Basic Salary Info¶

def show_salary_info():    
    dlist = []
    def add(df, name):
        mean = df.salary.mean()
        median = df.salary.median()
        quants = list(df.salary.quantile([0.01, 0.1, 0.25, 0.75, 0.9, 0.99]))
        d = {
            'name' : name,
            'mean' : mean,
            'median': median,
            '1%' : quants[0],
            '10%' : quants[1],
            '25%' : quants[2],
            '75%' : quants[3],
            '90%' : quants[4],
            '99%' : quants[5],
        }
        dlist.append(d)
        
    sv_male_df = sv_df[sv_df.sex == 'M']
    sv_female_df = sv_df[sv_df.sex == 'F']
    other_male_df = other_df[other_df.sex == 'M']
    other_female_df = other_df[other_df.sex == 'F']
        
    add(sv_df, 'Silicon Valley')
    add(other_df, 'Non-Silicon Valley')
    add(sv_male_df, 'SV_Males')
    add(other_male_df, 'Non_SV_Males')
    add(sv_female_df, 'SV_Females')    
    add(other_female_df, 'Non_SV_Females')
    df = pd.DataFrame(dlist)
    df = df['name mean median 1% 10% 25% 75% 90% 99%'.split()]
    for c in ['mean', 'median'] + [c for c in df.columns if '%' in c]:
        df[c] = df[c].astype(int)        
    s = df.style
    s.background_gradient(cmap='coolwarm')    
        
    return s

# print("SV Salary Quantiles:")
# print(sv_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

# print("Non_SV Salary Quantiles:")
# print(other_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

show_salary_info()

Visualizations¶

F_ALPHA = 0.3

def graph_salary_vs_experience():
    p = figure(title="Salary vs Years of Experience", y_axis_label='Salary (1k)', x_axis_label='Years Experience')                      

    x = sv_df.years_experience
    y = sv_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_experience
    y = other_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_experience()

def graph_salary_vs_years_at_job():
    p = figure(title="Salary vs Years at Job", y_axis_label='Salary (1k)', x_axis_label='Years at Job')                      

    x = sv_df.years_job
    y = sv_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_job
    y = other_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_years_at_job()

def graph_salary_vs_gender(df):
    
    male = df[df.sex == 'M']
    female = df[df.sex == 'F']
    other = df[df.sex == 'other']
    
    p = figure(title="Salary vs Years Experience (by Gender)", y_axis_label='Salary (1k)', x_axis_label='Years Experience')                      

    x = male.years_experience
    y = male.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Male')
    
    x = female.years_experience
    y = female.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='blue', legend='Female')
    
    x = other.years_experience
    y = other.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Other')
    
    show(p)
    
graph_salary_vs_gender(df)

Let's try to see salary by common geographic locations¶

def regional_salaries():
    res_list = []
    for name, grp in df.groupby('std_location'):
        if len(grp) < 8 or name == '':
            continue
        res = (name, grp.salary.mean() / 1000)
        res_list.append(res)
    print(res_list)
    
    p = figure()
    p.plot_width = 1000
    p.plot_height = 800
    
    def sort_func(tup):
        return tup[1]
    
    res_list.sort(key=sort_func)
    
    x = [i for i in range(len(res_list))]
    y = [r[1] for r in res_list]
    
    p.scatter(x, y)
    texts = [r[0] for r in res_list]
    p.text(x, y, texts, angle=-0.8)
    
    for name, mean in res_list:
        pass
        
    show(p)
    
regional_salaries()

[('atlanta', 91.181727272727272), ('austin', 93.223684210526315), ('bay area', 134.91666666666666), ('berlin', 60.545454545454547), ('boston', 107.34480851063829), ('cambridge', 79.25), ('chicago', 97.824324324324323), ('denver', 97.763157894736835), ('houston', 127.0), ('london', 57.958333333333336), ('los angeles', 120.01851851851852), ('menlo park', 155.15384615384616), ('minneapolis', 87.44285714285715), ('mountain view', 136.18039999999999), ('new york', 125.16666666666667), ('new york city', 114.36666666666667), ('nyc', 119.0948275862069), ('palo alto', 135.45454545454544), ('philadelphia', 91.230769230769241), ('portland', 98.029411764705884), ('redmond', 122.32476190476191), ('remote', 128.79166666666669), ('salt lake city', 92.549999999999997), ('san diego', 93.57692307692308), ('san francisco', 133.69791411042945), ('san jose', 141.80769230769232), ('seattle', 120.27631578947368), ('sf', 146.59999999999999), ('sunnyvale', 123.25), ('sweden', 53.715375000000002), ('toronto', 97.390461538461537), ('vancouver', 93.400000000000006), ('washington', 143.4375), ('washington dc', 111.2)]

	timestamp	employer	location	job_title	years_at_employer	years_of_experience	annual_base_pay	signing_bonus	annual_bonus	annual_stock_value/bonus	gender	additional_comments
0	3/21/2016 12:54:49	NaN	Raleigh, NC	Software Developer	NaN	18	122000	NaN	NaN	NaN	Male	NaN
1	3/21/2016 12:58:52	Opower	San Francisco, CA	Systems Engineer	2	13	125000	5000	0	5000 shares	Male	Don't work here.
2	3/21/2016 12:58:57	NaN	San Diego CA	Systems engineer	1	1 of employment	83,000	0	$50.00	0	Male	NaN
3	3/21/2016 12:58:58	Walmart	Bentonville, AR	Senior Developer	8	15	65,000	NaN	5,000	3,000	Male	NaN
4	3/21/2016 12:59:11	Vertical Knowledge	Cleveland, OH	Software Engineer	1	4	86000	5000	6000	0	Male	NaN

	location	job_title	salary	years_at_employer	years_of_experience	sex
677	san francisco	CTO Google rail product	356000	5	12	M
1524	NaN	General Manager	350000	0.5	20	M
488	Los Gatos	Senior Software Engineer	350000	1	10	M
104	Jersey City, NJ	Consultant, CTO as needed	325000	1	8	M
510	San Francisco	CTO	310000	2	10	M

	location	job_title	salary	years_at_employer	years_of_experience	sex
1121	Madrid	Support	22500	15	15	M
1148	Cluj-Napoca, Romania	Software Engineer	21500	2	2	M
577	Bangalore, India	Developer	21496	0.5	3	M
1216	Nottingham UK	Jnr Software Developer	20500	1	1	M
841	Paris, France	Research Engineer	20400	0	1	M

	name	mean	median	1%	10%	25%	75%	90%	99%
0	Silicon Valley	134720	130000	59560	95000	110000	155750	175000	270100
1	Non-Silicon Valley	99759	95000	25000	47000	70000	125000	150000	242799
2	SV_Males	137120	132500	66290	99400	110750	160500	175000	277100
3	Non_SV_Males	100125	97000	25000	48000	70000	125000	150000	239359
4	SV_Females	99187	107500	56800	73000	87500	111020	120959	124595
5	Non_SV_Females	91765	90000	29480	52400	73000	110000	124200	206799