{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Read Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"ibm = pd.read_csv('/WA_Fn-UseC_-HR-Employee-Attrition.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset Information"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1470, 35)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>DailyRate</th>\n",
" <th>DistanceFromHome</th>\n",
" <th>Education</th>\n",
" <th>EmployeeCount</th>\n",
" <th>EmployeeNumber</th>\n",
" <th>EnvironmentSatisfaction</th>\n",
" <th>HourlyRate</th>\n",
" <th>JobInvolvement</th>\n",
" <th>JobLevel</th>\n",
" <th>JobSatisfaction</th>\n",
" <th>MonthlyIncome</th>\n",
" <th>MonthlyRate</th>\n",
" <th>NumCompaniesWorked</th>\n",
" <th>PercentSalaryHike</th>\n",
" <th>PerformanceRating</th>\n",
" <th>RelationshipSatisfaction</th>\n",
" <th>StandardHours</th>\n",
" <th>StockOptionLevel</th>\n",
" <th>TotalWorkingYears</th>\n",
" <th>TrainingTimesLastYear</th>\n",
" <th>WorkLifeBalance</th>\n",
" <th>YearsAtCompany</th>\n",
" <th>YearsInCurrentRole</th>\n",
" <th>YearsSinceLastPromotion</th>\n",
" <th>YearsWithCurrManager</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.0</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.0</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" <td>1470.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>36.923810</td>\n",
" <td>802.485714</td>\n",
" <td>9.192517</td>\n",
" <td>2.912925</td>\n",
" <td>1.0</td>\n",
" <td>1024.865306</td>\n",
" <td>2.721769</td>\n",
" <td>65.891156</td>\n",
" <td>2.729932</td>\n",
" <td>2.063946</td>\n",
" <td>2.728571</td>\n",
" <td>6502.931293</td>\n",
" <td>14313.103401</td>\n",
" <td>2.693197</td>\n",
" <td>15.209524</td>\n",
" <td>3.153741</td>\n",
" <td>2.712245</td>\n",
" <td>80.0</td>\n",
" <td>0.793878</td>\n",
" <td>11.279592</td>\n",
" <td>2.799320</td>\n",
" <td>2.761224</td>\n",
" <td>7.008163</td>\n",
" <td>4.229252</td>\n",
" <td>2.187755</td>\n",
" <td>4.123129</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>9.135373</td>\n",
" <td>403.509100</td>\n",
" <td>8.106864</td>\n",
" <td>1.024165</td>\n",
" <td>0.0</td>\n",
" <td>602.024335</td>\n",
" <td>1.093082</td>\n",
" <td>20.329428</td>\n",
" <td>0.711561</td>\n",
" <td>1.106940</td>\n",
" <td>1.102846</td>\n",
" <td>4707.956783</td>\n",
" <td>7117.786044</td>\n",
" <td>2.498009</td>\n",
" <td>3.659938</td>\n",
" <td>0.360824</td>\n",
" <td>1.081209</td>\n",
" <td>0.0</td>\n",
" <td>0.852077</td>\n",
" <td>7.780782</td>\n",
" <td>1.289271</td>\n",
" <td>0.706476</td>\n",
" <td>6.126525</td>\n",
" <td>3.623137</td>\n",
" <td>3.222430</td>\n",
" <td>3.568136</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>18.000000</td>\n",
" <td>102.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>30.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1009.000000</td>\n",
" <td>2094.000000</td>\n",
" <td>0.000000</td>\n",
" <td>11.000000</td>\n",
" <td>3.000000</td>\n",
" <td>1.000000</td>\n",
" <td>80.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>30.000000</td>\n",
" <td>465.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" <td>1.0</td>\n",
" <td>491.250000</td>\n",
" <td>2.000000</td>\n",
" <td>48.000000</td>\n",
" <td>2.000000</td>\n",
" <td>1.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2911.000000</td>\n",
" <td>8047.000000</td>\n",
" <td>1.000000</td>\n",
" <td>12.000000</td>\n",
" <td>3.000000</td>\n",
" <td>2.000000</td>\n",
" <td>80.0</td>\n",
" <td>0.000000</td>\n",
" <td>6.000000</td>\n",
" <td>2.000000</td>\n",
" <td>2.000000</td>\n",
" <td>3.000000</td>\n",
" <td>2.000000</td>\n",
" <td>0.000000</td>\n",
" <td>2.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>36.000000</td>\n",
" <td>802.000000</td>\n",
" <td>7.000000</td>\n",
" <td>3.000000</td>\n",
" <td>1.0</td>\n",
" <td>1020.500000</td>\n",
" <td>3.000000</td>\n",
" <td>66.000000</td>\n",
" <td>3.000000</td>\n",
" <td>2.000000</td>\n",
" <td>3.000000</td>\n",
" <td>4919.000000</td>\n",
" <td>14235.500000</td>\n",
" <td>2.000000</td>\n",
" <td>14.000000</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>80.0</td>\n",
" <td>1.000000</td>\n",
" <td>10.000000</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>5.000000</td>\n",
" <td>3.000000</td>\n",
" <td>1.000000</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>43.000000</td>\n",
" <td>1157.000000</td>\n",
" <td>14.000000</td>\n",
" <td>4.000000</td>\n",
" <td>1.0</td>\n",
" <td>1555.750000</td>\n",
" <td>4.000000</td>\n",
" <td>83.750000</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>4.000000</td>\n",
" <td>8379.000000</td>\n",
" <td>20461.500000</td>\n",
" <td>4.000000</td>\n",
" <td>18.000000</td>\n",
" <td>3.000000</td>\n",
" <td>4.000000</td>\n",
" <td>80.0</td>\n",
" <td>1.000000</td>\n",
" <td>15.000000</td>\n",
" <td>3.000000</td>\n",
" <td>3.000000</td>\n",
" <td>9.000000</td>\n",
" <td>7.000000</td>\n",
" <td>3.000000</td>\n",
" <td>7.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>60.000000</td>\n",
" <td>1499.000000</td>\n",
" <td>29.000000</td>\n",
" <td>5.000000</td>\n",
" <td>1.0</td>\n",
" <td>2068.000000</td>\n",
" <td>4.000000</td>\n",
" <td>100.000000</td>\n",
" <td>4.000000</td>\n",
" <td>5.000000</td>\n",
" <td>4.000000</td>\n",
" <td>19999.000000</td>\n",
" <td>26999.000000</td>\n",
" <td>9.000000</td>\n",
" <td>25.000000</td>\n",
" <td>4.000000</td>\n",
" <td>4.000000</td>\n",
" <td>80.0</td>\n",
" <td>3.000000</td>\n",
" <td>40.000000</td>\n",
" <td>6.000000</td>\n",
" <td>4.000000</td>\n",
" <td>40.000000</td>\n",
" <td>18.000000</td>\n",
" <td>15.000000</td>\n",
" <td>17.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Age DailyRate DistanceFromHome Education EmployeeCount \\\n",
"count 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 \n",
"mean 36.923810 802.485714 9.192517 2.912925 1.0 \n",
"std 9.135373 403.509100 8.106864 1.024165 0.0 \n",
"min 18.000000 102.000000 1.000000 1.000000 1.0 \n",
"25% 30.000000 465.000000 2.000000 2.000000 1.0 \n",
"50% 36.000000 802.000000 7.000000 3.000000 1.0 \n",
"75% 43.000000 1157.000000 14.000000 4.000000 1.0 \n",
"max 60.000000 1499.000000 29.000000 5.000000 1.0 \n",
"\n",
" EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement \\\n",
"count 1470.000000 1470.000000 1470.000000 1470.000000 \n",
"mean 1024.865306 2.721769 65.891156 2.729932 \n",
"std 602.024335 1.093082 20.329428 0.711561 \n",
"min 1.000000 1.000000 30.000000 1.000000 \n",
"25% 491.250000 2.000000 48.000000 2.000000 \n",
"50% 1020.500000 3.000000 66.000000 3.000000 \n",
"75% 1555.750000 4.000000 83.750000 3.000000 \n",
"max 2068.000000 4.000000 100.000000 4.000000 \n",
"\n",
" JobLevel JobSatisfaction MonthlyIncome MonthlyRate \\\n",
"count 1470.000000 1470.000000 1470.000000 1470.000000 \n",
"mean 2.063946 2.728571 6502.931293 14313.103401 \n",
"std 1.106940 1.102846 4707.956783 7117.786044 \n",
"min 1.000000 1.000000 1009.000000 2094.000000 \n",
"25% 1.000000 2.000000 2911.000000 8047.000000 \n",
"50% 2.000000 3.000000 4919.000000 14235.500000 \n",
"75% 3.000000 4.000000 8379.000000 20461.500000 \n",
"max 5.000000 4.000000 19999.000000 26999.000000 \n",
"\n",
" NumCompaniesWorked PercentSalaryHike PerformanceRating \\\n",
"count 1470.000000 1470.000000 1470.000000 \n",
"mean 2.693197 15.209524 3.153741 \n",
"std 2.498009 3.659938 0.360824 \n",
"min 0.000000 11.000000 3.000000 \n",
"25% 1.000000 12.000000 3.000000 \n",
"50% 2.000000 14.000000 3.000000 \n",
"75% 4.000000 18.000000 3.000000 \n",
"max 9.000000 25.000000 4.000000 \n",
"\n",
" RelationshipSatisfaction StandardHours StockOptionLevel \\\n",
"count 1470.000000 1470.0 1470.000000 \n",
"mean 2.712245 80.0 0.793878 \n",
"std 1.081209 0.0 0.852077 \n",
"min 1.000000 80.0 0.000000 \n",
"25% 2.000000 80.0 0.000000 \n",
"50% 3.000000 80.0 1.000000 \n",
"75% 4.000000 80.0 1.000000 \n",
"max 4.000000 80.0 3.000000 \n",
"\n",
" TotalWorkingYears TrainingTimesLastYear WorkLifeBalance \\\n",
"count 1470.000000 1470.000000 1470.000000 \n",
"mean 11.279592 2.799320 2.761224 \n",
"std 7.780782 1.289271 0.706476 \n",
"min 0.000000 0.000000 1.000000 \n",
"25% 6.000000 2.000000 2.000000 \n",
"50% 10.000000 3.000000 3.000000 \n",
"75% 15.000000 3.000000 3.000000 \n",
"max 40.000000 6.000000 4.000000 \n",
"\n",
" YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion \\\n",
"count 1470.000000 1470.000000 1470.000000 \n",
"mean 7.008163 4.229252 2.187755 \n",
"std 6.126525 3.623137 3.222430 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 3.000000 2.000000 0.000000 \n",
"50% 5.000000 3.000000 1.000000 \n",
"75% 9.000000 7.000000 3.000000 \n",
"max 40.000000 18.000000 15.000000 \n",
"\n",
" YearsWithCurrManager \n",
"count 1470.000000 \n",
"mean 4.123129 \n",
"std 3.568136 \n",
"min 0.000000 \n",
"25% 2.000000 \n",
"50% 3.000000 \n",
"75% 7.000000 \n",
"max 17.000000 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Age mode: 35\n",
"Attrition mode: No\n",
"BusinessTravel mode: Travel_Rarely\n",
"DailyRate mode: 691\n",
"Department mode: Research & Development\n",
"DistanceFromHome mode: 2\n",
"Education mode: 3\n",
"EducationField mode: Life Sciences\n",
"EmployeeCount mode: 1\n",
"EmployeeNumber mode: 1\n",
"EnvironmentSatisfaction mode: 3\n",
"Gender mode: Male\n",
"HourlyRate mode: 66\n",
"JobInvolvement mode: 3\n",
"JobLevel mode: 1\n",
"JobRole mode: Sales Executive\n",
"JobSatisfaction mode: 4\n",
"MaritalStatus mode: Married\n",
"MonthlyIncome mode: 2342\n",
"MonthlyRate mode: 9150\n",
"NumCompaniesWorked mode: 1\n",
"Over18 mode: Y\n",
"OverTime mode: No\n",
"PercentSalaryHike mode: 11\n",
"PerformanceRating mode: 3\n",
"RelationshipSatisfaction mode: 3\n",
"StandardHours mode: 80\n",
"StockOptionLevel mode: 0\n",
"TotalWorkingYears mode: 10\n",
"TrainingTimesLastYear mode: 2\n",
"WorkLifeBalance mode: 3\n",
"YearsAtCompany mode: 5\n",
"YearsInCurrentRole mode: 2\n",
"YearsSinceLastPromotion mode: 0\n",
"YearsWithCurrManager mode: 2\n"
]
}
],
"source": [
"import statistics\n",
"\n",
"# Report the most frequent value of every column, one line per column.\n",
"for column_name in ibm.columns:\n",
"    column_mode = statistics.mode(ibm[column_name])\n",
"    print(column_name, \" mode: \", column_mode)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 1470 entries, 0 to 1469\n",
"Data columns (total 35 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Age 1470 non-null int64 \n",
" 1 Attrition 1470 non-null object\n",
" 2 BusinessTravel 1470 non-null object\n",
" 3 DailyRate 1470 non-null int64 \n",
" 4 Department 1470 non-null object\n",
" 5 DistanceFromHome 1470 non-null int64 \n",
" 6 Education 1470 non-null int64 \n",
" 7 EducationField 1470 non-null object\n",
" 8 EmployeeCount 1470 non-null int64 \n",
" 9 EmployeeNumber 1470 non-null int64 \n",
" 10 EnvironmentSatisfaction 1470 non-null int64 \n",
" 11 Gender 1470 non-null object\n",
" 12 HourlyRate 1470 non-null int64 \n",
" 13 JobInvolvement 1470 non-null int64 \n",
" 14 JobLevel 1470 non-null int64 \n",
" 15 JobRole 1470 non-null object\n",
" 16 JobSatisfaction 1470 non-null int64 \n",
" 17 MaritalStatus 1470 non-null object\n",
" 18 MonthlyIncome 1470 non-null int64 \n",
" 19 MonthlyRate 1470 non-null int64 \n",
" 20 NumCompaniesWorked 1470 non-null int64 \n",
" 21 Over18 1470 non-null object\n",
" 22 OverTime 1470 non-null object\n",
" 23 PercentSalaryHike 1470 non-null int64 \n",
" 24 PerformanceRating 1470 non-null int64 \n",
" 25 RelationshipSatisfaction 1470 non-null int64 \n",
" 26 StandardHours 1470 non-null int64 \n",
" 27 StockOptionLevel 1470 non-null int64 \n",
" 28 TotalWorkingYears 1470 non-null int64 \n",
" 29 TrainingTimesLastYear 1470 non-null int64 \n",
" 30 WorkLifeBalance 1470 non-null int64 \n",
" 31 YearsAtCompany 1470 non-null int64 \n",
" 32 YearsInCurrentRole 1470 non-null int64 \n",
" 33 YearsSinceLastPromotion 1470 non-null int64 \n",
" 34 YearsWithCurrManager 1470 non-null int64 \n",
"dtypes: int64(26), object(9)\n",
"memory usage: 402.1+ KB\n"
]
}
],
"source": [
"ibm.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Data Preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Drop four columns in a single call. EmployeeCount and StandardHours are constant\n",
"# (std of 0 in the describe() output above).\n",
"columns_to_drop = ['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours']\n",
"ibm.drop(columns = columns_to_drop, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Attrition</th>\n",
" <th>BusinessTravel</th>\n",
" <th>DailyRate</th>\n",
" <th>Department</th>\n",
" <th>DistanceFromHome</th>\n",
" <th>Education</th>\n",
" <th>EducationField</th>\n",
" <th>EnvironmentSatisfaction</th>\n",
" <th>Gender</th>\n",
" <th>HourlyRate</th>\n",
" <th>JobInvolvement</th>\n",
" <th>JobLevel</th>\n",
" <th>JobRole</th>\n",
" <th>JobSatisfaction</th>\n",
" <th>MaritalStatus</th>\n",
" <th>MonthlyIncome</th>\n",
" <th>MonthlyRate</th>\n",
" <th>NumCompaniesWorked</th>\n",
" <th>OverTime</th>\n",
" <th>PercentSalaryHike</th>\n",
" <th>PerformanceRating</th>\n",
" <th>RelationshipSatisfaction</th>\n",
" <th>StockOptionLevel</th>\n",
" <th>TotalWorkingYears</th>\n",
" <th>TrainingTimesLastYear</th>\n",
" <th>WorkLifeBalance</th>\n",
" <th>YearsAtCompany</th>\n",
" <th>YearsInCurrentRole</th>\n",
" <th>YearsSinceLastPromotion</th>\n",
" <th>YearsWithCurrManager</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>41</td>\n",
" <td>Yes</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>1102</td>\n",
" <td>Sales</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>Life Sciences</td>\n",
" <td>2</td>\n",
" <td>Female</td>\n",
" <td>94</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>Sales Executive</td>\n",
" <td>4</td>\n",
" <td>Single</td>\n",
" <td>5993</td>\n",
" <td>19479</td>\n",
" <td>8</td>\n",
" <td>Yes</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>No</td>\n",
" <td>Travel_Frequently</td>\n",
" <td>279</td>\n",
" <td>Research & Development</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>Life Sciences</td>\n",
" <td>3</td>\n",
" <td>Male</td>\n",
" <td>61</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Research Scientist</td>\n",
" <td>2</td>\n",
" <td>Married</td>\n",
" <td>5130</td>\n",
" <td>24907</td>\n",
" <td>1</td>\n",
" <td>No</td>\n",
" <td>23</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>10</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>37</td>\n",
" <td>Yes</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>1373</td>\n",
" <td>Research & Development</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Other</td>\n",
" <td>4</td>\n",
" <td>Male</td>\n",
" <td>92</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Laboratory Technician</td>\n",
" <td>3</td>\n",
" <td>Single</td>\n",
" <td>2090</td>\n",
" <td>2396</td>\n",
" <td>6</td>\n",
" <td>Yes</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>33</td>\n",
" <td>No</td>\n",
" <td>Travel_Frequently</td>\n",
" <td>1392</td>\n",
" <td>Research & Development</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>Life Sciences</td>\n",
" <td>4</td>\n",
" <td>Female</td>\n",
" <td>56</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Research Scientist</td>\n",
" <td>3</td>\n",
" <td>Married</td>\n",
" <td>2909</td>\n",
" <td>23159</td>\n",
" <td>1</td>\n",
" <td>Yes</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>27</td>\n",
" <td>No</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>591</td>\n",
" <td>Research & Development</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>Medical</td>\n",
" <td>1</td>\n",
" <td>Male</td>\n",
" <td>40</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>Laboratory Technician</td>\n",
" <td>2</td>\n",
" <td>Married</td>\n",
" <td>3468</td>\n",
" <td>16632</td>\n",
" <td>9</td>\n",
" <td>No</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1465</th>\n",
" <td>36</td>\n",
" <td>No</td>\n",
" <td>Travel_Frequently</td>\n",
" <td>884</td>\n",
" <td>Research & Development</td>\n",
" <td>23</td>\n",
" <td>2</td>\n",
" <td>Medical</td>\n",
" <td>3</td>\n",
" <td>Male</td>\n",
" <td>41</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>Laboratory Technician</td>\n",
" <td>4</td>\n",
" <td>Married</td>\n",
" <td>2571</td>\n",
" <td>12290</td>\n",
" <td>4</td>\n",
" <td>No</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1466</th>\n",
" <td>39</td>\n",
" <td>No</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>613</td>\n",
" <td>Research & Development</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>Medical</td>\n",
" <td>4</td>\n",
" <td>Male</td>\n",
" <td>42</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>Healthcare Representative</td>\n",
" <td>1</td>\n",
" <td>Married</td>\n",
" <td>9991</td>\n",
" <td>21457</td>\n",
" <td>4</td>\n",
" <td>No</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1467</th>\n",
" <td>27</td>\n",
" <td>No</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>155</td>\n",
" <td>Research & Development</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>Life Sciences</td>\n",
" <td>2</td>\n",
" <td>Male</td>\n",
" <td>87</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>Manufacturing Director</td>\n",
" <td>2</td>\n",
" <td>Married</td>\n",
" <td>6142</td>\n",
" <td>5174</td>\n",
" <td>1</td>\n",
" <td>Yes</td>\n",
" <td>20</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1468</th>\n",
" <td>49</td>\n",
" <td>No</td>\n",
" <td>Travel_Frequently</td>\n",
" <td>1023</td>\n",
" <td>Sales</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>Medical</td>\n",
" <td>4</td>\n",
" <td>Male</td>\n",
" <td>63</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>Sales Executive</td>\n",
" <td>2</td>\n",
" <td>Married</td>\n",
" <td>5390</td>\n",
" <td>13243</td>\n",
" <td>2</td>\n",
" <td>No</td>\n",
" <td>14</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1469</th>\n",
" <td>34</td>\n",
" <td>No</td>\n",
" <td>Travel_Rarely</td>\n",
" <td>628</td>\n",
" <td>Research & Development</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>Medical</td>\n",
" <td>2</td>\n",
" <td>Male</td>\n",
" <td>82</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>Laboratory Technician</td>\n",
" <td>3</td>\n",
" <td>Married</td>\n",
" <td>4404</td>\n",
" <td>10228</td>\n",
" <td>2</td>\n",
" <td>No</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1470 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" Age Attrition BusinessTravel DailyRate Department \\\n",
"0 41 Yes Travel_Rarely 1102 Sales \n",
"1 49 No Travel_Frequently 279 Research & Development \n",
"2 37 Yes Travel_Rarely 1373 Research & Development \n",
"3 33 No Travel_Frequently 1392 Research & Development \n",
"4 27 No Travel_Rarely 591 Research & Development \n",
"... ... ... ... ... ... \n",
"1465 36 No Travel_Frequently 884 Research & Development \n",
"1466 39 No Travel_Rarely 613 Research & Development \n",
"1467 27 No Travel_Rarely 155 Research & Development \n",
"1468 49 No Travel_Frequently 1023 Sales \n",
"1469 34 No Travel_Rarely 628 Research & Development \n",
"\n",
" DistanceFromHome Education EducationField EnvironmentSatisfaction \\\n",
"0 1 2 Life Sciences 2 \n",
"1 8 1 Life Sciences 3 \n",
"2 2 2 Other 4 \n",
"3 3 4 Life Sciences 4 \n",
"4 2 1 Medical 1 \n",
"... ... ... ... ... \n",
"1465 23 2 Medical 3 \n",
"1466 6 1 Medical 4 \n",
"1467 4 3 Life Sciences 2 \n",
"1468 2 3 Medical 4 \n",
"1469 8 3 Medical 2 \n",
"\n",
" Gender HourlyRate JobInvolvement JobLevel JobRole \\\n",
"0 Female 94 3 2 Sales Executive \n",
"1 Male 61 2 2 Research Scientist \n",
"2 Male 92 2 1 Laboratory Technician \n",
"3 Female 56 3 1 Research Scientist \n",
"4 Male 40 3 1 Laboratory Technician \n",
"... ... ... ... ... ... \n",
"1465 Male 41 4 2 Laboratory Technician \n",
"1466 Male 42 2 3 Healthcare Representative \n",
"1467 Male 87 4 2 Manufacturing Director \n",
"1468 Male 63 2 2 Sales Executive \n",
"1469 Male 82 4 2 Laboratory Technician \n",
"\n",
" JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate \\\n",
"0 4 Single 5993 19479 \n",
"1 2 Married 5130 24907 \n",
"2 3 Single 2090 2396 \n",
"3 3 Married 2909 23159 \n",
"4 2 Married 3468 16632 \n",
"... ... ... ... ... \n",
"1465 4 Married 2571 12290 \n",
"1466 1 Married 9991 21457 \n",
"1467 2 Married 6142 5174 \n",
"1468 2 Married 5390 13243 \n",
"1469 3 Married 4404 10228 \n",
"\n",
" NumCompaniesWorked OverTime PercentSalaryHike PerformanceRating \\\n",
"0 8 Yes 11 3 \n",
"1 1 No 23 4 \n",
"2 6 Yes 15 3 \n",
"3 1 Yes 11 3 \n",
"4 9 No 12 3 \n",
"... ... ... ... ... \n",
"1465 4 No 17 3 \n",
"1466 4 No 15 3 \n",
"1467 1 Yes 20 4 \n",
"1468 2 No 14 3 \n",
"1469 2 No 12 3 \n",
"\n",
" RelationshipSatisfaction StockOptionLevel TotalWorkingYears \\\n",
"0 1 0 8 \n",
"1 4 1 10 \n",
"2 2 0 7 \n",
"3 3 0 8 \n",
"4 4 1 6 \n",
"... ... ... ... \n",
"1465 3 1 17 \n",
"1466 1 1 9 \n",
"1467 2 1 6 \n",
"1468 4 0 17 \n",
"1469 1 0 6 \n",
"\n",
" TrainingTimesLastYear WorkLifeBalance YearsAtCompany \\\n",
"0 0 1 6 \n",
"1 3 3 10 \n",
"2 3 3 0 \n",
"3 3 3 8 \n",
"4 3 3 2 \n",
"... ... ... ... \n",
"1465 3 3 5 \n",
"1466 5 3 7 \n",
"1467 0 3 6 \n",
"1468 3 2 9 \n",
"1469 3 4 4 \n",
"\n",
" YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager \n",
"0 4 0 5 \n",
"1 7 1 7 \n",
"2 0 0 0 \n",
"3 7 3 0 \n",
"4 2 2 2 \n",
"... ... ... ... \n",
"1465 2 0 3 \n",
"1466 7 1 7 \n",
"1467 2 0 3 \n",
"1468 6 0 8 \n",
"1469 3 1 2 \n",
"\n",
"[1470 rows x 31 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# drop_duplicates() returns a new frame and leaves `ibm` untouched, so the result\n",
"# must be assigned back for the de-duplication to take effect.\n",
"ibm = ibm.drop_duplicates()\n",
"ibm"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Age 0\n",
"Attrition 0\n",
"BusinessTravel 0\n",
"DailyRate 0\n",
"Department 0\n",
"DistanceFromHome 0\n",
"Education 0\n",
"EducationField 0\n",
"EnvironmentSatisfaction 0\n",
"Gender 0\n",
"HourlyRate 0\n",
"JobInvolvement 0\n",
"JobLevel 0\n",
"JobRole 0\n",
"JobSatisfaction 0\n",
"MaritalStatus 0\n",
"MonthlyIncome 0\n",
"MonthlyRate 0\n",
"NumCompaniesWorked 0\n",
"OverTime 0\n",
"PercentSalaryHike 0\n",
"PerformanceRating 0\n",
"RelationshipSatisfaction 0\n",
"StockOptionLevel 0\n",
"TotalWorkingYears 0\n",
"TrainingTimesLastYear 0\n",
"WorkLifeBalance 0\n",
"YearsAtCompany 0\n",
"YearsInCurrentRole 0\n",
"YearsSinceLastPromotion 0\n",
"YearsWithCurrManager 0\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Encode Attrition numerically: 'No' -> 0, 'Yes' -> 1\n",
"attrition_codes = {'Yes': 1, 'No': 0}\n",
"ibm.replace({'Attrition': attrition_codes}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Encode BusinessTravel numerically: 'Non-Travel' -> 0, 'Travel_Rarely' -> 1, 'Travel_Frequently' -> 2\n",
"business_travel_codes = {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2}\n",
"ibm.replace({'BusinessTravel': business_travel_codes}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode Department; the indicator columns are inserted at positions 5-7.\n",
"# NOTE(review): the 'Research & Development' indicator is stored as 'Dp_Sales&Development'.\n",
"# The label looks like a typo, but it is kept because later cells may reference it -- confirm before renaming.\n",
"dummy = pd.get_dummies(ibm['Department'])\n",
"department_indicators = [\n",
"    ('Dp_Sales&Development', 'Research & Development'),\n",
"    ('Dp_Sales', 'Sales'),\n",
"    ('Dp_HumanResources', 'Human Resources'),\n",
"]\n",
"for position, (indicator_name, category) in enumerate(department_indicators, start = 5):\n",
"    ibm.insert(position, indicator_name, dummy[category])\n",
"\n",
"ibm.drop(columns = 'Department', inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode EducationField; the indicator columns are inserted at positions 11-16.\n",
"dummy = pd.get_dummies(ibm['EducationField'])\n",
"education_field_indicators = [\n",
"    ('EF_Life Sciences', 'Life Sciences'),\n",
"    ('EF_Medical', 'Medical'),\n",
"    ('EF_Marketing', 'Marketing'),\n",
"    ('EF_TechnicalDegree', 'Technical Degree'),\n",
"    ('EF_HumanResources', 'Human Resources'),\n",
"    ('EF_Other', 'Other'),\n",
"]\n",
"for position, (indicator_name, category) in enumerate(education_field_indicators, start = 11):\n",
"    ibm.insert(position, indicator_name, dummy[category])\n",
"\n",
"ibm.drop(columns = 'EducationField', inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Encode Gender numerically: 'Male' -> 0, 'Female' -> 1\n",
"gender_codes = {'Male': 0, 'Female': 1}\n",
"ibm.replace({'Gender': gender_codes}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode JobRole; the indicator columns are inserted at positions 23-31.\n",
"# NOTE(review): 'JR_HealthcareRepresentive' and 'JR_HumanResource' look misspelled.\n",
"# The names are kept unchanged because later cells may reference them -- confirm before renaming.\n",
"dummy = pd.get_dummies(ibm['JobRole'])\n",
"job_role_indicators = [\n",
"    ('JR_HealthcareRepresentive', 'Healthcare Representative'),\n",
"    ('JR_HumanResource', 'Human Resources'),\n",
"    ('JR_LaboratoryTechnician', 'Laboratory Technician'),\n",
"    ('JR_Manager', 'Manager'),\n",
"    ('JR_ManufacturingDirector', 'Manufacturing Director'),\n",
"    ('JR_ResearchDirector', 'Research Director'),\n",
"    ('JR_ResearchScientist', 'Research Scientist'),\n",
"    ('JR_SalesExecutive', 'Sales Executive'),\n",
"    ('JR_SalesRepresentative', 'Sales Representative'),\n",
"]\n",
"for position, (indicator_name, category) in enumerate(job_role_indicators, start = 23):\n",
"    ibm.insert(position, indicator_name, dummy[category])\n",
"\n",
"ibm.drop(columns = 'JobRole', inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# One-hot encode MaritalStatus; the indicator columns are inserted at positions 34-36.\n",
"dummy = pd.get_dummies(ibm['MaritalStatus'])\n",
"marital_status_indicators = [\n",
"    ('MS_Married', 'Married'),\n",
"    ('MS_Single', 'Single'),\n",
"    ('MS_Divorced', 'Divorced'),\n",
"]\n",
"for position, (indicator_name, category) in enumerate(marital_status_indicators, start = 34):\n",
"    ibm.insert(position, indicator_name, dummy[category])\n",
"\n",
"ibm.drop(columns = 'MaritalStatus', inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Encode OverTime numerically: 'No' -> 0, 'Yes' -> 1\n",
"overtime_codes = {'No': 0, 'Yes': 1}\n",
"ibm.replace({'OverTime': overtime_codes}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# replace Over18 (0 - N; 1 - Y)\n",
"ibm.replace({'Over18': {'N': 0, 'Y': 1}}, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"def iqr_outliers(data):\n",
" out=[]\n",
" \n",
" firstQuartile = data.quantile(0.25)\n",
" thirdQuartile = data.quantile(0.75)\n",
" \n",
" iqr = thirdQuartile-firstQuartile\n",
" \n",
" Lower_bound = firstQuartile - 1.5 * iqr\n",
" Upper_bound = thirdQuartile + 1.5 * iqr\n",
" \n",
" for i in data:\n",
" if i > Upper_bound or i < Lower_bound:\n",
" out.append(i)\n",
" \n",
" print(\"Outliers:\",out , \"\\nCount: \", len(out), \"\\n\")\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Age\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Attrition\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 237 \n",
"\n",
"BusinessTravel\n",
"Outliers: [2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 0, 2, 2, 0, 2, 2] \n",
"Count: 427 \n",
"\n",
"DailyRate\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Dp_Sales&Development\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Dp_Sales\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Dp_HumanResources\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 63 \n",
"\n",
"DistanceFromHome\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"Education\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"EnvironmentSatisfaction\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"EF_Life Sciences\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"EF_Medical\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"EF_Marketing\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 159 \n",
"\n",
"EF_TechnicalDegree\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 132 \n",
"\n",
"EF_HumanResources\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 27 \n",
"\n",
"EF_Other\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 82 \n",
"\n",
"Gender\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"HourlyRate\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"JobInvolvement\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"JobLevel\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"JobSatisfaction\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"JR_HealthcareRepresentive\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 131 \n",
"\n",
"JR_HumanResource\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 52 \n",
"\n",
"JR_LaboratoryTechnician\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 259 \n",
"\n",
"JR_Manager\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 102 \n",
"\n",
"JR_ManufacturingDirector\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 145 \n",
"\n",
"JR_ResearchDirector\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 80 \n",
"\n",
"JR_ResearchScientist\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 292 \n",
"\n",
"JR_SalesExecutive\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 326 \n",
"\n",
"JR_SalesRepresentative\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 83 \n",
"\n",
"MonthlyIncome\n",
"Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880] \n",
"Count: 114 \n",
"\n",
"MonthlyRate\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"NumCompaniesWorked\n",
"Outliers: [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9] \n",
"Count: 52 \n",
"\n",
"MS_Married\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"MS_Single\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"MS_Divorced\n",
"Outliers: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] \n",
"Count: 327 \n",
"\n",
"OverTime\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"PercentSalaryHike\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"PerformanceRating\n",
"Outliers: [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4] \n",
"Count: 226 \n",
"\n",
"RelationshipSatisfaction\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"StockOptionLevel\n",
"Outliers: [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] \n",
"Count: 85 \n",
"\n",
"TotalWorkingYears\n",
"Outliers: [31, 29, 37, 38, 30, 40, 36, 34, 32, 33, 37, 30, 36, 31, 33, 32, 37, 31, 32, 32, 30, 34, 30, 40, 29, 35, 31, 33, 31, 29, 32, 30, 33, 30, 29, 31, 32, 33, 36, 34, 31, 36, 33, 31, 29, 33, 29, 32, 31, 35, 29, 32, 34, 36, 32, 30, 36, 29, 34, 37, 29, 29, 35] \n",
"Count: 63 \n",
"\n",
"TrainingTimesLastYear\n",
"Outliers: [0, 5, 5, 5, 6, 5, 5, 5, 6, 6, 0, 0, 0, 5, 0, 5, 5, 5, 6, 6, 5, 0, 6, 5, 5, 0, 5, 5, 6, 5, 5, 5, 0, 5, 5, 5, 5, 6, 6, 5, 5, 5, 5, 0, 0, 5, 5, 5, 6, 6, 5, 0, 5, 0, 5, 5, 0, 6, 0, 5, 5, 6, 6, 5, 6, 5, 0, 5, 5, 5, 5, 0, 6, 5, 5, 5, 5, 6, 5, 5, 6, 5, 5, 5, 0, 5, 0, 5, 5, 6, 5, 6, 5, 0, 5, 5, 0, 6, 6, 5, 6, 0, 5, 0, 6, 6, 6, 6, 5, 5, 0, 5, 0, 0, 6, 0, 6, 5, 6, 5, 5, 0, 5, 6, 6, 5, 5, 0, 0, 6, 0, 0, 5, 0, 5, 6, 5, 5, 6, 6, 5, 5, 5, 5, 5, 6, 5, 6, 6, 0, 6, 6, 5, 5, 0, 0, 6, 6, 0, 5, 0, 0, 0, 0, 0, 5, 5, 6, 5, 5, 0, 5, 5, 0, 5, 5, 6, 5, 5, 5, 6, 5, 5, 5, 0, 0, 5, 5, 5, 5, 6, 0, 0, 6, 6, 6, 6, 5, 5, 5, 6, 5, 0, 5, 5, 6, 5, 6, 6, 5, 6, 6, 5, 0, 5, 5, 5, 5, 5, 0, 0, 0, 6, 5, 6, 6, 5, 6, 0, 6, 6, 5, 6, 6, 5, 5, 5, 0] \n",
"Count: 238 \n",
"\n",
"WorkLifeBalance\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n",
"YearsAtCompany\n",
"Outliers: [25, 22, 22, 27, 21, 22, 37, 25, 20, 40, 20, 24, 20, 24, 33, 20, 19, 22, 33, 24, 19, 21, 20, 36, 20, 20, 22, 24, 21, 21, 25, 21, 29, 20, 27, 20, 31, 32, 20, 20, 21, 22, 22, 34, 24, 26, 31, 20, 31, 26, 19, 21, 21, 32, 21, 19, 20, 22, 20, 21, 26, 20, 22, 24, 33, 29, 25, 21, 19, 19, 20, 19, 33, 19, 19, 20, 20, 20, 20, 20, 32, 20, 21, 33, 36, 26, 30, 22, 23, 23, 21, 21, 22, 22, 19, 22, 19, 22, 20, 20, 20, 22, 20, 20] \n",
"Count: 104 \n",
"\n",
"YearsInCurrentRole\n",
"Outliers: [15, 16, 18, 15, 18, 17, 16, 15, 16, 15, 16, 16, 15, 16, 17, 15, 15, 15, 17, 17, 16] \n",
"Count: 21 \n",
"\n",
"YearsSinceLastPromotion\n",
"Outliers: [8, 15, 8, 8, 9, 13, 12, 10, 11, 9, 12, 15, 15, 15, 9, 11, 11, 9, 12, 11, 15, 11, 10, 9, 11, 9, 8, 11, 11, 8, 13, 9, 9, 12, 10, 11, 15, 13, 9, 11, 10, 8, 8, 11, 9, 11, 12, 11, 14, 13, 14, 8, 11, 15, 10, 11, 11, 15, 11, 13, 11, 13, 15, 8, 13, 15, 11, 14, 15, 15, 9, 11, 9, 8, 9, 15, 11, 12, 9, 8, 10, 14, 8, 13, 13, 12, 14, 8, 8, 8, 14, 14, 8, 12, 13, 14, 14, 12, 11, 8, 11, 9, 12, 8, 9, 11, 9] \n",
"Count: 107 \n",
"\n",
"YearsWithCurrManager\n",
"Outliers: [17, 15, 15, 15, 15, 17, 16, 17, 15, 17, 17, 17, 17, 16] \n",
"Count: 14 \n",
"\n"
]
}
],
"source": [
"# Print the IQR outliers of every column in ibm, one column at a time.\n",
"for c_name, column_values in ibm.items():\n",
"    print(c_name)\n",
"    iqr_outliers(column_values)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def remove_outliers(c_name):\n",
"    \"\"\"Drop rows of the global ``ibm`` frame until ``c_name`` has no IQR outliers.\n",
"\n",
"    ``iqr_outliers`` is re-run after every pass: removing rows changes the\n",
"    quartiles, so values that were inside the fences can fall outside the\n",
"    new ones. The loop stops when a pass reports no outliers.\n",
"\n",
"    ``ibm`` is modified in place and nothing is returned.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    c_name : column label\n",
"        Column of ``ibm`` whose outliers decide which rows are dropped.\n",
"    \"\"\"\n",
"    outliers = iqr_outliers(ibm[c_name])\n",
"\n",
"    while outliers:\n",
"        # Drop every row holding an outlier value in one go, rather than\n",
"        # issuing a separate drop (and column scan) per outlier value.\n",
"        ibm.drop(ibm.index[ibm[c_name].isin(outliers)], inplace=True)\n",
"        outliers = iqr_outliers(ibm[c_name])"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Outliers: [19094, 18947, 19545, 18740, 18844, 18172, 17328, 16959, 19537, 17181, 19926, 19033, 18722, 19999, 16792, 19232, 19517, 19068, 19202, 19436, 16872, 19045, 19144, 17584, 18665, 17068, 19272, 18300, 16659, 19406, 19197, 19566, 18041, 17046, 17861, 16835, 16595, 19502, 18200, 16627, 19513, 19141, 19189, 16856, 19859, 18430, 17639, 16752, 19246, 17159, 17924, 17099, 17444, 17399, 19419, 18303, 19973, 19845, 17650, 19237, 19627, 16756, 17665, 16885, 17465, 19626, 19943, 18606, 17048, 17856, 19081, 17779, 19740, 18711, 18265, 18213, 18824, 18789, 19847, 19190, 18061, 17123, 16880, 17861, 19187, 19717, 16799, 17328, 19701, 17169, 16598, 17007, 16606, 19586, 19331, 19613, 17567, 19049, 19658, 17426, 17603, 16704, 19833, 19038, 19328, 19392, 19665, 16823, 17174, 17875, 19161, 19636, 19431, 18880] \n",
"Count: 114 \n",
"\n",
"Outliers: [15427, 13458, 14756, 13245, 13664, 13503, 13549, 13872, 13734, 13591, 16064, 13675, 13496, 13603, 13525, 16015, 13964, 15992, 14336, 13212, 16555, 14118, 13610, 13237, 16184, 15402, 14814, 13770, 16307, 13826, 14275, 13582, 14852, 13194, 13973, 13726, 13320, 13120, 13499, 13758, 13191, 16124, 13577, 14026, 13142, 13695, 13402, 13247, 14732, 16422, 13757, 16032, 16328, 14411, 16437, 15202, 16413, 13269, 13966, 15972, 15379, 12936, 12965, 13116, 13464, 16291, 15787, 13225, 13348, 13341, 13206, 13744, 13570] \n",
"Count: 73 \n",
"\n",
"Outliers: [11994, 12490, 12185, 11849, 11996, 12061, 11878, 12504, 11935, 12808, 11836, 12742, 11904, 12169, 11916, 11957, 12031] \n",
"Count: 17 \n",
"\n",
"Outliers: [11713, 11691] \n",
"Count: 2 \n",
"\n",
"Outliers: [11631] \n",
"Count: 1 \n",
"\n",
"Outliers: [] \n",
"Count: 0 \n",
"\n"
]
}
],
"source": [
"# Repeatedly strip IQR outliers from MonthlyIncome; remove_outliers mutates ibm in place.\n",
"remove_outliers('MonthlyIncome')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Age</th>\n",
" <th>Attrition</th>\n",
" <th>BusinessTravel</th>\n",
" <th>DailyRate</th>\n",
" <th>Dp_Sales&Development</th>\n",
" <th>Dp_Sales</th>\n",
" <th>Dp_HumanResources</th>\n",
" <th>DistanceFromHome</th>\n",
" <th>Education</th>\n",
" <th>EnvironmentSatisfaction</th>\n",
" <th>EF_Life Sciences</th>\n",
" <th>EF_Medical</th>\n",
" <th>EF_Marketing</th>\n",
" <th>EF_TechnicalDegree</th>\n",
" <th>EF_HumanResources</th>\n",
" <th>EF_Other</th>\n",
" <th>Gender</th>\n",
" <th>HourlyRate</th>\n",
" <th>JobInvolvement</th>\n",
" <th>JobLevel</th>\n",
" <th>JobSatisfaction</th>\n",
" <th>JR_HealthcareRepresentive</th>\n",
" <th>JR_HumanResource</th>\n",
" <th>JR_LaboratoryTechnician</th>\n",
" <th>JR_Manager</th>\n",
" <th>JR_ManufacturingDirector</th>\n",
" <th>JR_ResearchDirector</th>\n",
" <th>JR_ResearchScientist</th>\n",
" <th>JR_SalesExecutive</th>\n",
" <th>JR_SalesRepresentative</th>\n",
" <th>MonthlyIncome</th>\n",
" <th>MonthlyRate</th>\n",
" <th>NumCompaniesWorked</th>\n",
" <th>MS_Married</th>\n",
" <th>MS_Single</th>\n",
" <th>MS_Divorced</th>\n",
" <th>OverTime</th>\n",
" <th>PercentSalaryHike</th>\n",
" <th>PerformanceRating</th>\n",
" <th>RelationshipSatisfaction</th>\n",
" <th>StockOptionLevel</th>\n",
" <th>TotalWorkingYears</th>\n",
" <th>TrainingTimesLastYear</th>\n",
" <th>WorkLifeBalance</th>\n",
" <th>YearsAtCompany</th>\n",
" <th>YearsInCurrentRole</th>\n",
" <th>YearsSinceLastPromotion</th>\n",
" <th>YearsWithCurrManager</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>41</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1102</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>94</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>5993</td>\n",
" <td>19479</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>49</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>279</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>61</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5130</td>\n",
" <td>24907</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>10</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>37</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1373</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>92</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2090</td>\n",
" <td>2396</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>33</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1392</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>56</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2909</td>\n",
" <td>23159</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>11</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>8</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>27</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>591</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>40</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3468</td>\n",
" <td>16632</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1465</th>\n",
" <td>36</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>884</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>23</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>41</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2571</td>\n",
" <td>12290</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1466</th>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>613</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>42</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>9991</td>\n",
" <td>21457</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1467</th>\n",
" <td>27</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>155</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>87</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>6142</td>\n",
" <td>5174</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>20</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1468</th>\n",
" <td>49</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1023</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>63</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>5390</td>\n",
" <td>13243</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>14</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>17</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1469</th>\n",
" <td>34</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>628</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>82</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4404</td>\n",
" <td>10228</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1263 rows × 48 columns</p>\n",
"</div>"
],
"text/plain": [
" Age Attrition BusinessTravel DailyRate Dp_Sales&Development \\\n",
"0 41 1 1 1102 0 \n",
"1 49 0 2 279 1 \n",
"2 37 1 1 1373 1 \n",
"3 33 0 2 1392 1 \n",
"4 27 0 1 591 1 \n",
"... ... ... ... ... ... \n",
"1465 36 0 2 884 1 \n",
"1466 39 0 1 613 1 \n",
"1467 27 0 1 155 1 \n",
"1468 49 0 2 1023 0 \n",
"1469 34 0 1 628 1 \n",
"\n",
" Dp_Sales Dp_HumanResources DistanceFromHome Education \\\n",
"0 1 0 1 2 \n",
"1 0 0 8 1 \n",
"2 0 0 2 2 \n",
"3 0 0 3 4 \n",
"4 0 0 2 1 \n",
"... ... ... ... ... \n",
"1465 0 0 23 2 \n",
"1466 0 0 6 1 \n",
"1467 0 0 4 3 \n",
"1468 1 0 2 3 \n",
"1469 0 0 8 3 \n",
"\n",
" EnvironmentSatisfaction EF_Life Sciences EF_Medical EF_Marketing \\\n",
"0 2 1 0 0 \n",
"1 3 1 0 0 \n",
"2 4 0 0 0 \n",
"3 4 1 0 0 \n",
"4 1 0 1 0 \n",
"... ... ... ... ... \n",
"1465 3 0 1 0 \n",
"1466 4 0 1 0 \n",
"1467 2 1 0 0 \n",
"1468 4 0 1 0 \n",
"1469 2 0 1 0 \n",
"\n",
" EF_TechnicalDegree EF_HumanResources EF_Other Gender HourlyRate \\\n",
"0 0 0 0 1 94 \n",
"1 0 0 0 0 61 \n",
"2 0 0 1 0 92 \n",
"3 0 0 0 1 56 \n",
"4 0 0 0 0 40 \n",
"... ... ... ... ... ... \n",
"1465 0 0 0 0 41 \n",
"1466 0 0 0 0 42 \n",
"1467 0 0 0 0 87 \n",
"1468 0 0 0 0 63 \n",
"1469 0 0 0 0 82 \n",
"\n",
" JobInvolvement JobLevel JobSatisfaction JR_HealthcareRepresentive \\\n",
"0 3 2 4 0 \n",
"1 2 2 2 0 \n",
"2 2 1 3 0 \n",
"3 3 1 3 0 \n",
"4 3 1 2 0 \n",
"... ... ... ... ... \n",
"1465 4 2 4 0 \n",
"1466 2 3 1 1 \n",
"1467 4 2 2 0 \n",
"1468 2 2 2 0 \n",
"1469 4 2 3 0 \n",
"\n",
" JR_HumanResource JR_LaboratoryTechnician JR_Manager \\\n",
"0 0 0 0 \n",
"1 0 0 0 \n",
"2 0 1 0 \n",
"3 0 0 0 \n",
"4 0 1 0 \n",
"... ... ... ... \n",
"1465 0 1 0 \n",
"1466 0 0 0 \n",
"1467 0 0 0 \n",
"1468 0 0 0 \n",
"1469 0 1 0 \n",
"\n",
" JR_ManufacturingDirector JR_ResearchDirector JR_ResearchScientist \\\n",
"0 0 0 0 \n",
"1 0 0 1 \n",
"2 0 0 0 \n",
"3 0 0 1 \n",
"4 0 0 0 \n",
"... ... ... ... \n",
"1465 0 0 0 \n",
"1466 0 0 0 \n",
"1467 1 0 0 \n",
"1468 0 0 0 \n",
"1469 0 0 0 \n",
"\n",
" JR_SalesExecutive JR_SalesRepresentative MonthlyIncome MonthlyRate \\\n",
"0 1 0 5993 19479 \n",
"1 0 0 5130 24907 \n",
"2 0 0 2090 2396 \n",
"3 0 0 2909 23159 \n",
"4 0 0 3468 16632 \n",
"... ... ... ... ... \n",
"1465 0 0 2571 12290 \n",
"1466 0 0 9991 21457 \n",
"1467 0 0 6142 5174 \n",
"1468 1 0 5390 13243 \n",
"1469 0 0 4404 10228 \n",
"\n",
" NumCompaniesWorked MS_Married MS_Single MS_Divorced OverTime \\\n",
"0 8 0 1 0 1 \n",
"1 1 1 0 0 0 \n",
"2 6 0 1 0 1 \n",
"3 1 1 0 0 1 \n",
"4 9 1 0 0 0 \n",
"... ... ... ... ... ... \n",
"1465 4 1 0 0 0 \n",
"1466 4 1 0 0 0 \n",
"1467 1 1 0 0 1 \n",
"1468 2 1 0 0 0 \n",
"1469 2 1 0 0 0 \n",
"\n",
" PercentSalaryHike PerformanceRating RelationshipSatisfaction \\\n",
"0 11 3 1 \n",
"1 23 4 4 \n",
"2 15 3 2 \n",
"3 11 3 3 \n",
"4 12 3 4 \n",
"... ... ... ... \n",
"1465 17 3 3 \n",
"1466 15 3 1 \n",
"1467 20 4 2 \n",
"1468 14 3 4 \n",
"1469 12 3 1 \n",
"\n",
" StockOptionLevel TotalWorkingYears TrainingTimesLastYear \\\n",
"0 0 8 0 \n",
"1 1 10 3 \n",
"2 0 7 3 \n",
"3 0 8 3 \n",
"4 1 6 3 \n",
"... ... ... ... \n",
"1465 1 17 3 \n",
"1466 1 9 5 \n",
"1467 1 6 0 \n",
"1468 0 17 3 \n",
"1469 0 6 3 \n",
"\n",
" WorkLifeBalance YearsAtCompany YearsInCurrentRole \\\n",
"0 1 6 4 \n",
"1 3 10 7 \n",
"2 3 0 0 \n",
"3 3 8 7 \n",
"4 3 2 2 \n",
"... ... ... ... \n",
"1465 3 5 2 \n",
"1466 3 7 7 \n",
"1467 3 6 2 \n",
"1468 2 9 6 \n",
"1469 4 4 3 \n",
"\n",
" YearsSinceLastPromotion YearsWithCurrManager \n",
"0 0 5 \n",
"1 1 7 \n",
"2 0 0 \n",
"3 3 0 \n",
"4 2 2 \n",
"... ... ... \n",
"1465 0 3 \n",
"1466 1 7 \n",
"1467 0 3 \n",
"1468 0 8 \n",
"1469 1 2 \n",
"\n",
"[1263 rows x 48 columns]"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Classification"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Support Vector Machine (prepared by Teh Liang Sean) "
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"# import important library to do SVM\n",
"from sklearn import svm\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import metrics"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"# Split the prepared table into label and predictors: 'Attrition' is the SVM target (whether an employee stays at or leaves IBM)\n",
"y_svm = ibm['Attrition']\n",
"x_svm_find = ibm.drop(columns=['Attrition'])"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Features Score\n",
"29 MonthlyIncome 26471.159476\n",
"30 MonthlyRate 1308.443569\n",
"2 DailyRate 1111.594737\n",
"44 YearsInCurrentRole 109.263859\n",
"43 YearsAtCompany 103.805057\n",
"46 YearsWithCurrManager 100.636711\n",
"40 TotalWorkingYears 95.843571\n",
"35 OverTime 60.367656\n",
"6 DistanceFromHome 57.197704\n",
"0 Age 46.705340\n",
"28 JR_SalesRepresentative 27.299127\n",
"33 MS_Single 26.251695\n",
"39 StockOptionLevel 24.376114\n",
"20 JR_HealthcareRepresentive 10.935616\n",
"24 JR_ManufacturingDirector 9.987076\n"
]
}
],
"source": [
"# Rank the candidate features with SelectKBest and the chi-squared (chi2) test for non-negative features\n",
"from sklearn.feature_selection import SelectKBest, chi2\n",
"\n",
"# Selector configured to keep the 15 highest-scoring features\n",
"best_15_features = SelectKBest(score_func=chi2, k=15)\n",
"fit = best_15_features.fit(x_svm_find, y_svm)\n",
"\n",
"# One table pairing every feature name with its chi2 score\n",
"top_15_feature_scores = pd.DataFrame({'Features': x_svm_find.columns, 'Score': fit.scores_})\n",
"\n",
"# Print the 15 rows with the largest scores\n",
"print(top_15_feature_scores.nlargest(15, 'Score'))"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"# Build the SVM feature matrix from the 15 highest-scoring features reported by SelectKBest.\n",
"# The list order fixes the column order of the resulting table.\n",
"svm_feature_names = [\n",
"    'MonthlyIncome',\n",
"    'MonthlyRate',\n",
"    'DailyRate',\n",
"    'YearsInCurrentRole',\n",
"    'YearsAtCompany',\n",
"    'YearsWithCurrManager',\n",
"    'TotalWorkingYears',\n",
"    'OverTime',\n",
"    'DistanceFromHome',\n",
"    'Age',\n",
"    'JR_SalesRepresentative',\n",
"    'MS_Single',\n",
"    'StockOptionLevel',\n",
"    # Previously inserted as 'JR_HealthcareRepresentive ' (trailing space), which broke lookups by the real column name\n",
"    'JR_HealthcareRepresentive',\n",
"    'JR_ManufacturingDirector',\n",
"]\n",
"# .copy() keeps the feature table independent of ibm, as the column-by-column inserts did\n",
"ibm_svm_features_df = ibm[svm_feature_names].copy()"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>MonthlyIncome</th>\n",
" <th>MonthlyRate</th>\n",
" <th>DailyRate</th>\n",
" <th>YearsInCurrentRole</th>\n",
" <th>YearsAtCompany</th>\n",
" <th>YearsWithCurrManager</th>\n",
" <th>TotalWorkingYears</th>\n",
" <th>OverTime</th>\n",
" <th>DistanceFromHome</th>\n",
" <th>Age</th>\n",
" <th>JR_SalesRepresentative</th>\n",
" <th>MS_Single</th>\n",
" <th>StockOptionLevel</th>\n",
" <th>JR_HealthcareRepresentive</th>\n",
" <th>JR_ManufacturingDirector</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5993</td>\n",
" <td>19479</td>\n",
" <td>1102</td>\n",
" <td>4</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>5130</td>\n",
" <td>24907</td>\n",
" <td>279</td>\n",
" <td>7</td>\n",
" <td>10</td>\n",
" <td>7</td>\n",
" <td>10</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>49</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2090</td>\n",
" <td>2396</td>\n",
" <td>1373</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>7</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>37</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2909</td>\n",
" <td>23159</td>\n",
" <td>1392</td>\n",
" <td>7</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>33</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3468</td>\n",
" <td>16632</td>\n",
" <td>591</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>27</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1465</th>\n",
" <td>2571</td>\n",
" <td>12290</td>\n",
" <td>884</td>\n",
" <td>2</td>\n",
" <td>5</td>\n",
" <td>3</td>\n",
" <td>17</td>\n",
" <td>0</td>\n",
" <td>23</td>\n",
" <td>36</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1466</th>\n",
" <td>9991</td>\n",
" <td>21457</td>\n",
" <td>613</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>39</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1467</th>\n",
" <td>6142</td>\n",
" <td>5174</td>\n",
" <td>155</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>3</td>\n",
" <td>6</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>27</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1468</th>\n",
" <td>5390</td>\n",
" <td>13243</td>\n",
" <td>1023</td>\n",
" <td>6</td>\n",
" <td>9</td>\n",
" <td>8</td>\n",
" <td>17</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>49</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1469</th>\n",
" <td>4404</td>\n",
" <td>10228</td>\n",
" <td>628</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" <td>34</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1263 rows × 15 columns</p>\n",
"</div>"
],
"text/plain": [
" MonthlyIncome MonthlyRate DailyRate YearsInCurrentRole \\\n",
"0 5993 19479 1102 4 \n",
"1 5130 24907 279 7 \n",
"2 2090 2396 1373 0 \n",
"3 2909 23159 1392 7 \n",
"4 3468 16632 591 2 \n",
"... ... ... ... ... \n",
"1465 2571 12290 884 2 \n",
"1466 9991 21457 613 7 \n",
"1467 6142 5174 155 2 \n",
"1468 5390 13243 1023 6 \n",
"1469 4404 10228 628 3 \n",
"\n",
" YearsAtCompany YearsWithCurrManager TotalWorkingYears OverTime \\\n",
"0 6 5 8 1 \n",
"1 10 7 10 0 \n",
"2 0 0 7 1 \n",
"3 8 0 8 1 \n",
"4 2 2 6 0 \n",
"... ... ... ... ... \n",
"1465 5 3 17 0 \n",
"1466 7 7 9 0 \n",
"1467 6 3 6 1 \n",
"1468 9 8 17 0 \n",
"1469 4 2 6 0 \n",
"\n",
" DistanceFromHome Age JR_SalesRepresentative MS_Single \\\n",
"0 1 41 0 1 \n",
"1 8 49 0 0 \n",
"2 2 37 0 1 \n",
"3 3 33 0 0 \n",
"4 2 27 0 0 \n",
"... ... ... ... ... \n",
"1465 23 36 0 0 \n",
"1466 6 39 0 0 \n",
"1467 4 27 0 0 \n",
"1468 2 49 0 0 \n",
"1469 8 34 0 0 \n",
"\n",
" StockOptionLevel JR_HealthcareRepresentive JR_ManufacturingDirector \n",
"0 0 0 0 \n",
"1 1 0 0 \n",
"2 0 0 0 \n",
"3 0 0 0 \n",
"4 1 0 0 \n",
"... ... ... ... \n",
"1465 1 0 0 \n",
"1466 1 1 0 \n",
"1467 1 0 1 \n",
"1468 0 0 0 \n",
"1469 0 0 0 \n",
"\n",
"[1263 rows x 15 columns]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ibm_svm_features_df"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"#assignment ibm_svm_features to x\n",
"x_svm = ibm_svm_features_df"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Standardise all numeric feature columns to help the SVM train more effectively\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Fit the scaler on the feature matrix first, then apply it to that same matrix\n",
"s_scaler = StandardScaler().fit(x_svm)\n",
"x_scaled_svm = s_scaler.transform(x_svm)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"# Under-sample with Tomek links to ease the class imbalance (few positive 'Attrition' cases).\n",
"# sampling_strategy='not minority' resamples every class except the minority one.\n",
"from imblearn.under_sampling import TomekLinks\n",
"\n",
"tl_svm = TomekLinks(sampling_strategy='not minority')\n",
"# Resample the standardised matrix: x_scaled_svm was computed for the SVM but the raw x_svm was being passed,\n",
"# which left the scaling step unused and ran the resampling and every model on unscaled columns.\n",
"# NOTE(review): resampling happens before the train/test split, so the held-out set is resampled too - confirm this is intended.\n",
"x_tl_svm, y_tl_svm = tl_svm.fit_resample(x_scaled_svm, y_svm)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Hold out 20% of the resampled data for testing and train the models on the remaining 80%,\n",
"# keeping the class proportions of the target in both parts (stratified split)\n",
"x_train_svm, x_test_svm, y_train_svm, y_test_svm = train_test_split(\n",
"    x_tl_svm,\n",
"    y_tl_svm,\n",
"    stratify=y_tl_svm,\n",
"    random_state=40,\n",
"    test_size=0.2,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# Model 1: SVM whose hyperparameters were picked by hand\n",
"model_1_svm = svm.SVC(\n",
"    kernel='sigmoid',\n",
"    C=2,\n",
"    gamma='scale',\n",
"    coef0=0.6,\n",
"    probability=True,\n",
"    random_state=40,\n",
")\n",
"# fit() returns the estimator itself, so training and predicting on the test set can be chained\n",
"y_predict_1_svm = model_1_svm.fit(x_train_svm, y_train_svm).predict(x_test_svm)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Fitting 4 folds for each of 5400 candidates, totalling 21600 fits\n"
]
}
],
"source": [
"# Model 2: let GridSearchCV choose the SVM hyperparameters by 4-fold cross-validation.\n",
"# Only a subset of the SVC hyperparameters is tuned.\n",
"from sklearn.model_selection import GridSearchCV\n",
"\n",
"# Candidate values (np.arange excludes the stop value; float steps are approximate, e.g. 2.8000000000000016):\n",
"#   C     - regularisation parameter: 1.0 up to 2.9 in steps of 0.1 (20 values)\n",
"#   coef0 - independent term of the kernel: 0.0 up to 1.4 in steps of 0.1 (15 values)\n",
"#   gamma - kernel coefficient: 'auto' or 'scale'\n",
"c_values = list(np.arange(1.0, 3.0, 0.1))\n",
"coef0_values = list(np.arange(0.0, 1.5, 0.1))\n",
"gamma_values = ['auto', 'scale']\n",
"\n",
"# One sub-grid per kernel, so each kernel is only crossed with the parameters it uses:\n",
"#   linear  - C only\n",
"#   rbf     - C and gamma\n",
"#   sigmoid - C, gamma and coef0\n",
"# 'degree' is not searched: it only affects the 'poly' kernel, which is not a candidate here.\n",
"# This gives 20 + 40 + 600 = 660 distinct candidates instead of 5400 mostly duplicated ones.\n",
"param_grid = [\n",
"    {'kernel': ['linear'], 'C': c_values},\n",
"    {'kernel': ['rbf'], 'C': c_values, 'gamma': gamma_values},\n",
"    {'kernel': ['sigmoid'], 'C': c_values, 'gamma': gamma_values, 'coef0': coef0_values},\n",
"]\n",
"\n",
"# Fixed random_state (40), all CPU cores (n_jobs=-1), per-fit progress messages (verbose=2)\n",
"find_best_para_model = svm.SVC(random_state=40)\n",
"Grid_search_svm = GridSearchCV(find_best_para_model, param_grid, n_jobs=-1, verbose=2, cv=4)\n",
"# This may take some time to run\n",
"Grid_search_svm.fit(x_train_svm, y_train_svm)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'C': 2.8000000000000016,\n",
" 'coef0': 0.0,\n",
" 'degree': 3,\n",
" 'gamma': 'scale',\n",
" 'kernel': 'rbf'}"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the best hyperparameter found by grid search\n",
"Grid_search_svm.best_params_"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Model 2: SVM built from the hyperparameters selected by the grid search.\n",
"# best_params_ is unpacked directly instead of pasting its values by hand, so the model always matches\n",
"# the latest search result; parameters missing from best_params_ keep their SVC defaults.\n",
"model_2_svm = svm.SVC(**Grid_search_svm.best_params_, probability=True, random_state=40)\n",
"model_2_svm.fit(x_train_svm, y_train_svm)\n",
"y_predict_2_svm = model_2_svm.predict(x_test_svm)"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy of prediction classification result for 2 model\n",
"Hyperparameters that try to tune manually (model 1): 0.7416666666666667\n",
"Best hyperparameters found using GridSearchCV (model 2): 0.8166666666666667\n"
]
}
],
"source": [
"# Compare both models by their accuracy on the held-out test set\n",
"accuracy_model_1 = metrics.accuracy_score(y_test_svm, y_predict_1_svm)\n",
"accuracy_model_2 = metrics.accuracy_score(y_test_svm, y_predict_2_svm)\n",
"\n",
"print('Accuracy of prediction classification result for 2 model')\n",
"print('Hyperparameters that try to tune manually (model 1): ', accuracy_model_1)\n",
"print('Best hyperparameters found using GridSearchCV (model 2): ', accuracy_model_2)"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[183 12]\n",
" [ 32 13]]\n",
" precision recall f1-score support\n",
"\n",
" 0 0.85 0.94 0.89 195\n",
" 1 0.52 0.29 0.37 45\n",
"\n",
" accuracy 0.82 240\n",
" macro avg 0.69 0.61 0.63 240\n",
"weighted avg 0.79 0.82 0.79 240\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\USER\\anaconda3\\lib\\site-packages\\sklearn\\utils\\validation.py:70: FutureWarning: Pass labels=[0, 1] as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error\n",
" warnings.warn(f\"Pass {args_msg} as keyword args. From version \"\n"
]
}
],
"source": [
"# Evaluate model 2 with a confusion matrix (rows = true class, columns = predicted class, in the order 0, 1)\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"\n",
"# labels is passed by keyword: the positional form raises a FutureWarning and is rejected by scikit-learn >= 1.0\n",
"print(confusion_matrix(y_test_svm, y_predict_2_svm, labels=[0, 1]))\n",
"\n",
"# Per-class precision, recall and F1-measure for model 2\n",
"print(classification_report(y_test_svm, y_predict_2_svm))"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Evaluate model 2 with a ROC curve and the area under it (AUC)\n",
"from matplotlib import pyplot as plt\n",
"# roc_auc_score is called below, so import it here instead of relying on it already being in the kernel namespace\n",
"from sklearn.metrics import roc_auc_score, roc_curve\n",
"\n",
"# Second column of predict_proba: estimated probability of the second class (label 1)\n",
"y_pred_prob_svm = model_2_svm.predict_proba(x_test_svm)[:, 1]\n",
"fpr, tpr, threshold = roc_curve(y_test_svm, y_pred_prob_svm)\n",
"auc = roc_auc_score(y_test_svm, y_pred_prob_svm)\n",
"\n",
"# Dashed diagonal = reference line where true-positive rate equals false-positive rate\n",
"plt.plot([0, 1], [0, 1], 'k--')\n",
"plt.plot(fpr, tpr)\n",
"plt.title(f'AUC: {auc}')\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}