import pandas as pd
import numpy as np

df = pd.read_csv("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/LargeData/m1_survey_data.csv")

# your code goes here
print(df.duplicated().sum())
df.shape

154

(11552, 85)

# your code goes here
df = df.drop_duplicates()

# your code goes here
print(df.duplicated().sum())
df.shape

0

(11398, 85)

# your code goes here
missing_data = df.isnull()
for column in missing_data.columns.values.tolist():
    print(column)
    print (missing_data[column].value_counts())
    print("")

Respondent
False    11398
Name: Respondent, dtype: int64

MainBranch
False    11398
Name: MainBranch, dtype: int64

Hobbyist
False    11398
Name: Hobbyist, dtype: int64

OpenSourcer
False    11398
Name: OpenSourcer, dtype: int64

OpenSource
False    11317
True        81
Name: OpenSource, dtype: int64

Employment
False    11398
Name: Employment, dtype: int64

Country
False    11398
Name: Country, dtype: int64

Student
False    11347
True        51
Name: Student, dtype: int64

EdLevel
False    11286
True       112
Name: EdLevel, dtype: int64

UndergradMajor
False    10661
True       737
Name: UndergradMajor, dtype: int64

EduOther
False    11234
True       164
Name: EduOther, dtype: int64

OrgSize
False    11302
True        96
Name: OrgSize, dtype: int64

DevType
False    11333
True        65
Name: DevType, dtype: int64

YearsCode
False    11389
True         9
Name: YearsCode, dtype: int64

Age1stCode
False    11385
True        13
Name: Age1stCode, dtype: int64

YearsCodePro
False    11382
True        16
Name: YearsCodePro, dtype: int64

CareerSat
False    11398
Name: CareerSat, dtype: int64

JobSat
False    11397
True         1
Name: JobSat, dtype: int64

MgrIdiot
False    10905
True       493
Name: MgrIdiot, dtype: int64

MgrMoney
False    10901
True       497
Name: MgrMoney, dtype: int64

MgrWant
False    10905
True       493
Name: MgrWant, dtype: int64

JobSeek
False    11398
Name: JobSeek, dtype: int64

LastHireDate
False    11398
Name: LastHireDate, dtype: int64

LastInt
False    10985
True       413
Name: LastInt, dtype: int64

FizzBuzz
False    11361
True        37
Name: FizzBuzz, dtype: int64

JobFactors
False    11395
True         3
Name: JobFactors, dtype: int64

ResumeUpdate
False    11359
True        39
Name: ResumeUpdate, dtype: int64

CurrencySymbol
False    11398
Name: CurrencySymbol, dtype: int64

CurrencyDesc
False    11398
Name: CurrencyDesc, dtype: int64

CompTotal
False    10589
True       809
Name: CompTotal, dtype: int64

CompFreq
False    11192
True       206
Name: CompFreq, dtype: int64

ConvertedComp
False    10582
True       816
Name: ConvertedComp, dtype: int64

WorkWeekHrs
False    11276
True       122
Name: WorkWeekHrs, dtype: int64

WorkPlan
False    11277
True       121
Name: WorkPlan, dtype: int64

WorkChallenge
False    11234
True       164
Name: WorkChallenge, dtype: int64

WorkRemote
False    11390
True         8
Name: WorkRemote, dtype: int64

WorkLoc
False    11366
True        32
Name: WorkLoc, dtype: int64

ImpSyn
False    11393
True         5
Name: ImpSyn, dtype: int64

CodeRev
False    11397
True         1
Name: CodeRev, dtype: int64

CodeRevHrs
False    8972
True     2426
Name: CodeRevHrs, dtype: int64

UnitTests
False    11369
True        29
Name: UnitTests, dtype: int64

PurchaseHow
False    11202
True       196
Name: PurchaseHow, dtype: int64

PurchaseWhat
False    11360
True        38
Name: PurchaseWhat, dtype: int64

LanguageWorkedWith
False    11387
True        11
Name: LanguageWorkedWith, dtype: int64

LanguageDesireNextYear
False    11264
True       134
Name: LanguageDesireNextYear, dtype: int64

DatabaseWorkedWith
False    10945
True       453
Name: DatabaseWorkedWith, dtype: int64

DatabaseDesireNextYear
False    10356
True      1042
Name: DatabaseDesireNextYear, dtype: int64

PlatformWorkedWith
False    10987
True       411
Name: PlatformWorkedWith, dtype: int64

PlatformDesireNextYear
False    10854
True       544
Name: PlatformDesireNextYear, dtype: int64

WebFrameWorkedWith
False    10005
True      1393
Name: WebFrameWorkedWith, dtype: int64

WebFrameDesireNextYear
False    9781
True     1617
Name: WebFrameDesireNextYear, dtype: int64

MiscTechWorkedWith
False    9216
True     2182
Name: MiscTechWorkedWith, dtype: int64

MiscTechDesireNextYear
False    9943
True     1455
Name: MiscTechDesireNextYear, dtype: int64

DevEnviron
False    11369
True        29
Name: DevEnviron, dtype: int64

OpSys
False    11364
True        34
Name: OpSys, dtype: int64

Containers
False    11316
True        82
Name: Containers, dtype: int64

BlockchainOrg
False    9076
True     2322
Name: BlockchainOrg, dtype: int64

BlockchainIs
False    8788
True     2610
Name: BlockchainIs, dtype: int64

BetterLife
False    11300
True        98
Name: BetterLife, dtype: int64

ITperson
False    11363
True        35
Name: ITperson, dtype: int64

OffOn
False    11360
True        38
Name: OffOn, dtype: int64

SocialMedia
False    11105
True       293
Name: SocialMedia, dtype: int64

Extraversion
False    11378
True        20
Name: Extraversion, dtype: int64

ScreenName
False    10891
True       507
Name: ScreenName, dtype: int64

SOVisit1st
False    11073
True       325
Name: SOVisit1st, dtype: int64

SOVisitFreq
False    11393
True         5
Name: SOVisitFreq, dtype: int64

SOVisitTo
False    11397
True         1
Name: SOVisitTo, dtype: int64

SOFindAnswer
False    11395
True         3
Name: SOFindAnswer, dtype: int64

SOTimeSaved
False    11348
True        50
Name: SOTimeSaved, dtype: int64

SOHowMuchTime
False    9481
True     1917
Name: SOHowMuchTime, dtype: int64

SOAccount
False    11397
True         1
Name: SOAccount, dtype: int64

SOPartFreq
False    10270
True      1128
Name: SOPartFreq, dtype: int64

SOJobs
False    11392
True         6
Name: SOJobs, dtype: int64

EntTeams
False    11393
True         5
Name: EntTeams, dtype: int64

SOComm
False    11398
Name: SOComm, dtype: int64

WelcomeChange
False    11313
True        85
Name: WelcomeChange, dtype: int64

SONewContent
False    9433
True     1965
Name: SONewContent, dtype: int64

Age
False    11111
True       287
Name: Age, dtype: int64

Gender
False    11325
True        73
Name: Gender, dtype: int64

Trans
False    11275
True       123
Name: Trans, dtype: int64

Sexuality
False    10856
True       542
Name: Sexuality, dtype: int64

Ethnicity
False    10723
True       675
Name: Ethnicity, dtype: int64

Dependents
False    11258
True       140
Name: Dependents, dtype: int64

SurveyLength
False    11379
True        19
Name: SurveyLength, dtype: int64

SurveyEase
False    11384
True        14
Name: SurveyEase, dtype: int64

# your code goes here
df["WorkLoc"].isnull().sum()

32

# your code goes here
df["WorkLoc"].value_counts()

Office                                            6806
Home                                              3589
Other place, such as a coworking space or cafe     971
Name: WorkLoc, dtype: int64

#make a note of the majority value here, for future reference
#'Office' has majority value which is 6806.
majority_WorkLoc =df["WorkLoc"].value_counts().idxmax()

# your code goes here
df["WorkLoc"].replace(np.nan, majority_WorkLoc, inplace=True)

# your code goes here
df["WorkLoc"].isnull().sum()

0

# your code goes here
df["CompFreq"].value_counts()

Yearly     6073
Monthly    4788
Weekly      331
Name: CompFreq, dtype: int64

# your code goes here
df.loc[df['CompFreq'] == "Yearly", 'NormalizedAnnualCompensation'] = df['CompTotal']
df.loc[df['CompFreq'] == "Monthly", 'NormalizedAnnualCompensation'] = df['CompTotal']*12
df.loc[df['CompFreq'] == "Weekly", 'NormalizedAnnualCompensation'] = df['CompTotal']*52

df.head()

df["NormalizedAnnualCompensation"].median()

100000.0

	Respondent	MainBranch	Hobbyist	OpenSourcer	OpenSource	Employment	Country	Student	EdLevel	UndergradMajor	...	SONewContent	Age	Gender	Trans	Sexuality	Ethnicity	Dependents	SurveyLength	SurveyEase	NormalizedAnnualCompensation
0	4	I am a developer by profession	No	Never	The quality of OSS and closed source software ...	Employed full-time	United States	No	Bachelor’s degree (BA, BS, B.Eng., etc.)	Computer science, computer engineering, or sof...	...	Tech articles written by other developers;Indu...	22.0	Man	No	Straight / Heterosexual	White or of European descent	No	Appropriate in length	Easy	61000.0
1	9	I am a developer by profession	Yes	Once a month or more often	The quality of OSS and closed source software ...	Employed full-time	New Zealand	No	Some college/university study without earning ...	Computer science, computer engineering, or sof...	...	NaN	23.0	Man	No	Bisexual	White or of European descent	No	Appropriate in length	Neither easy nor difficult	138000.0
2	13	I am a developer by profession	Yes	Less than once a month but more than once per ...	OSS is, on average, of HIGHER quality than pro...	Employed full-time	United States	No	Master’s degree (MA, MS, M.Eng., MBA, etc.)	Computer science, computer engineering, or sof...	...	Tech articles written by other developers;Cour...	28.0	Man	No	Straight / Heterosexual	White or of European descent	Yes	Appropriate in length	Easy	90000.0
3	16	I am a developer by profession	Yes	Never	The quality of OSS and closed source software ...	Employed full-time	United Kingdom	No	Master’s degree (MA, MS, M.Eng., MBA, etc.)	NaN	...	Tech articles written by other developers;Indu...	26.0	Man	No	Straight / Heterosexual	White or of European descent	No	Appropriate in length	Neither easy nor difficult	348000.0
4	17	I am a developer by profession	Yes	Less than once a month but more than once per ...	The quality of OSS and closed source software ...	Employed full-time	Australia	No	Bachelor’s degree (BA, BS, B.Eng., etc.)	Computer science, computer engineering, or sof...	...	Tech articles written by other developers;Indu...	29.0	Man	No	Straight / Heterosexual	Hispanic or Latino/Latina;Multiracial	No	Appropriate in length	Easy	90000.0

Data Wrangling¶

Data Wrangling¶

Objectives¶

Hands on Lab¶

Finding duplicates¶

Removing duplicates¶

Finding Missing values¶

Imputing missing values¶

Normalizing data¶