In [2]:
import pandas as pd
import numpy as np
import re
from bokeh.plotting import *
output_notebook()
Loading BokehJS ...
In [3]:
def plen(x): print(len(x))

def format_column(c):
    return c.replace(' ', '_').lower()

df = pd.read_csv('salaries.csv')
df.columns = [format_column(c) for c in df.columns]
df.head()
Out[3]:
timestamp employer location job_title years_at_employer years_of_experience annual_base_pay signing_bonus annual_bonus annual_stock_value/bonus gender additional_comments
0 3/21/2016 12:54:49 NaN Raleigh, NC Software Developer NaN 18 122000 NaN NaN NaN Male NaN
1 3/21/2016 12:58:52 Opower San Francisco, CA Systems Engineer 2 13 125000 5000 0 5000 shares Male Don't work here.
2 3/21/2016 12:58:57 NaN San Diego CA Systems engineer 1 1 of employment 83,000 0 $50.00 0 Male NaN
3 3/21/2016 12:58:58 Walmart Bentonville, AR Senior Developer 8 15 65,000 NaN 5,000 3,000 Male NaN
4 3/21/2016 12:59:11 Vertical Knowledge Cleveland, OH Software Engineer 1 4 86000 5000 6000 0 Male NaN
In [14]:
def create_standard_location(df):
    
    def standardize(loc):
        if pd.isnull(loc):
            return ''
        return loc.split(',')[0].lower().strip()
    
    df['std_location'] = df.location.map(standardize)
    
    return df

df = create_standard_location(df)
In [15]:
def create_clean_salaries(df):

    def clean_individual_salary(x):    
        """
        Clean the salary data
        1. reject some malformed/difficult to parse data
        2. try to detect currency and return the type
        """    
        final_salary = np.nan
        salary_type = np.nan
        if pd.notnull(x):        
            salary_type = 'USD' # default
            x = x.lower()        
            x = re.sub("[\s,]", "", x)    
            x = re.sub("\.[0-9]*", "", x)
            if '€' in x or 'eur' in x:
                salary_type = 'Euro'
                x = re.sub(r'[€(eur)(euro)]', '', x)
            if '$' in x or 'usd' in x:            
                x = re.sub(r'[\$(usd)]', '', x)
            if '£' in x or 'gbp' in x:
                salary_type = 'GBP'
                x = x.replace('[£(gbp)]', '')                                
            try:
                final_salary = int(x)        

            except:
                #print("convert to numeric failure: '{}'".format(x))
                final_salary = np.nan
                salary_type = np.nan
        return (final_salary, salary_type)

    clean_salary_ufunc = np.frompyfunc(clean_individual_salary, 1, 2)
    final_salary, salary_type = clean_salary_ufunc(df.annual_base_pay)
    df['salary'] = final_salary
    df['salary_type'] = salary_type
    return df


def drop_unreasonable_salaries(df):    
    reasonable = df['salary'].map(lambda x: x > 20000 and x < 360000)
    print("resonable salaries:")
    print(reasonable.value_counts())    
    df = df[reasonable]
    return df

df = create_clean_salaries(df)
plen(df)
df = drop_unreasonable_salaries(df)
plen(df)
2187
resonable salaries:
True     1934
False     253
Name: salary, dtype: int64
1934
In [16]:
df.loc
Out[16]:
<pandas.core.indexing._LocIndexer at 0x7f919088ca58>
In [17]:
def create_sex(df):
    
    def convert_gender(x):
        if pd.isnull(x):
            return x
        elif 'female' in x.lower():
            return 'F'
        elif 'male' in x.lower():
            return 'M'
        else:
            return 'other'
        
    df['sex'] = df.gender.map(convert_gender)
    return df

df = create_sex(df)
df = df.dropna(subset='sex'.split())
df.sex.value_counts()
/home/jrenner/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[17]:
M        1790
F          66
other      10
Name: sex, dtype: int64
In [18]:
cols = 'location job_title salary years_at_employer years_of_experience sex'.split()
df = df.sort_values('salary', ascending=False)
In [19]:
def drop_unreasonable_years(df):
    plen(df)
    df = df.dropna(subset='salary years_job years_experience'.split())
    df = df[df.years_experience > 0]
    df = df[df.years_experience < 50]
    df = df[df.years_experience < 50]
    df = df[df.years_job >= 0]
    df = df[df.years_job <= 50]
    plen(df)
    return df

df['years_job'] = pd.to_numeric(df.years_at_employer, errors='coerce')
df['years_experience'] = pd.to_numeric(df.years_of_experience, errors='coerce')
df = drop_unreasonable_years(df)
1866
1668
In [20]:
df[cols].head(5)
Out[20]:
location job_title salary years_at_employer years_of_experience sex
677 san francisco CTO Google rail product 356000 5 12 M
1524 NaN General Manager 350000 0.5 20 M
488 Los Gatos Senior Software Engineer 350000 1 10 M
104 Jersey City, NJ Consultant, CTO as needed 325000 1 8 M
510 San Francisco CTO 310000 2 10 M
In [21]:
df[cols].tail(5)
Out[21]:
location job_title salary years_at_employer years_of_experience sex
1121 Madrid Support 22500 15 15 M
1148 Cluj-Napoca, Romania Software Engineer 21500 2 2 M
577 Bangalore, India Developer 21496 0.5 3 M
1216 Nottingham UK Jnr Software Developer 20500 1 1 M
841 Paris, France Research Engineer 20400 0 1 M
In [22]:
df = df[df.salary_type == 'USD']
loc = df.location.fillna('')
sv_set = {'bay area', 'san jose', 'san francisco', 'mountain view'}
is_silicon = loc.str.lower().isin(sv_set)
sv_df = df[is_silicon]
other_df = df[~is_silicon]  

print("silicon valley:", len(sv_df))
print("other:", len(other_df))
silicon valley: 158
other: 1473

Basic Salary Info

In [23]:
def show_salary_info():    
    dlist = []
    def add(df, name):
        mean = df.salary.mean()
        median = df.salary.median()
        quants = list(df.salary.quantile([0.01, 0.1, 0.25, 0.75, 0.9, 0.99]))
        d = {
            'name' : name,
            'mean' : mean,
            'median': median,
            '1%' : quants[0],
            '10%' : quants[1],
            '25%' : quants[2],
            '75%' : quants[3],
            '90%' : quants[4],
            '99%' : quants[5],
        }
        dlist.append(d)
        
    sv_male_df = sv_df[sv_df.sex == 'M']
    sv_female_df = sv_df[sv_df.sex == 'F']
    other_male_df = other_df[other_df.sex == 'M']
    other_female_df = other_df[other_df.sex == 'F']
        
    add(sv_df, 'Silicon Valley')
    add(other_df, 'Non-Silicon Valley')
    add(sv_male_df, 'SV_Males')
    add(other_male_df, 'Non_SV_Males')
    add(sv_female_df, 'SV_Females')    
    add(other_female_df, 'Non_SV_Females')
    df = pd.DataFrame(dlist)
    df = df['name mean median 1% 10% 25% 75% 90% 99%'.split()]
    for c in ['mean', 'median'] + [c for c in df.columns if '%' in c]:
        df[c] = df[c].astype(int)        
    s = df.style
    s.background_gradient(cmap='coolwarm')    
        
    return s

# print("SV Salary Quantiles:")
# print(sv_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

# print("Non_SV Salary Quantiles:")
# print(other_df.salary.quantile([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]))

show_salary_info()
Out[23]:
name mean median 1% 10% 25% 75% 90% 99%
0 Silicon Valley 134720 130000 59560 95000 110000 155750 175000 270100
1 Non-Silicon Valley 99759 95000 25000 47000 70000 125000 150000 242799
2 SV_Males 137120 132500 66290 99400 110750 160500 175000 277100
3 Non_SV_Males 100125 97000 25000 48000 70000 125000 150000 239359
4 SV_Females 99187 107500 56800 73000 87500 111020 120959 124595
5 Non_SV_Females 91765 90000 29480 52400 73000 110000 124200 206799

Visualizations

In [24]:
F_ALPHA = 0.3

def graph_salary_vs_experience():
    p = figure(title="Salary vs Years of Experience", y_axis_label='Salary (1k)', x_axis_label='Years Experience')                      

    x = sv_df.years_experience
    y = sv_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_experience
    y = other_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_experience()
In [25]:
def graph_salary_vs_years_at_job():
    p = figure(title="Salary vs Years at Job", y_axis_label='Salary (1k)', x_axis_label='Years at Job')                      

    x = sv_df.years_job
    y = sv_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Silicon Valley')
    
    x = other_df.years_job
    y = other_df.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Other USA')
    
    show(p)
    
graph_salary_vs_years_at_job()
In [26]:
def graph_salary_vs_gender(df):
    
    male = df[df.sex == 'M']
    female = df[df.sex == 'F']
    other = df[df.sex == 'other']
    
    p = figure(title="Salary vs Years Experience (by Gender)", y_axis_label='Salary (1k)', x_axis_label='Years Experience')                      

    x = male.years_experience
    y = male.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='grey', legend='Male')
    
    x = female.years_experience
    y = female.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='blue', legend='Female')
    
    x = other.years_experience
    y = other.salary / 1000
    p.scatter(x, y, size=10, fill_alpha=F_ALPHA, line_alpha=0, fill_color='red', legend='Other')
    
    show(p)
    
graph_salary_vs_gender(df)

Let's try to see salary by common geographic locations

In [64]:
def regional_salaries():
    res_list = []
    for name, grp in df.groupby('std_location'):
        if len(grp) < 8 or name == '':
            continue
        res = (name, grp.salary.mean() / 1000)
        res_list.append(res)
    print(res_list)
    
    p = figure()
    p.plot_width = 1000
    p.plot_height = 800
    
    def sort_func(tup):
        return tup[1]
    
    res_list.sort(key=sort_func)
    
    x = [i for i in range(len(res_list))]
    y = [r[1] for r in res_list]
    
    p.scatter(x, y)
    texts = [r[0] for r in res_list]
    p.text(x, y, texts, angle=-0.8)
    
    for name, mean in res_list:
        pass
        
    show(p)
    
regional_salaries()
[('atlanta', 91.181727272727272), ('austin', 93.223684210526315), ('bay area', 134.91666666666666), ('berlin', 60.545454545454547), ('boston', 107.34480851063829), ('cambridge', 79.25), ('chicago', 97.824324324324323), ('denver', 97.763157894736835), ('houston', 127.0), ('london', 57.958333333333336), ('los angeles', 120.01851851851852), ('menlo park', 155.15384615384616), ('minneapolis', 87.44285714285715), ('mountain view', 136.18039999999999), ('new york', 125.16666666666667), ('new york city', 114.36666666666667), ('nyc', 119.0948275862069), ('palo alto', 135.45454545454544), ('philadelphia', 91.230769230769241), ('portland', 98.029411764705884), ('redmond', 122.32476190476191), ('remote', 128.79166666666669), ('salt lake city', 92.549999999999997), ('san diego', 93.57692307692308), ('san francisco', 133.69791411042945), ('san jose', 141.80769230769232), ('seattle', 120.27631578947368), ('sf', 146.59999999999999), ('sunnyvale', 123.25), ('sweden', 53.715375000000002), ('toronto', 97.390461538461537), ('vancouver', 93.400000000000006), ('washington', 143.4375), ('washington dc', 111.2)]
In [ ]: