v Handling Outliers - Machine Learning

Handling Outliers

Preliminaries

# Load library
import pandas as pd

Create Data

# Create DataFrame
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

houses
Price Bathrooms Square_Feet
0 534433 2.0 1500
1 392333 3.5 2500
2 293222 2.0 1500
3 4322032 116.0 48000

Option 1: Drop

# Drop observations greater than some value
houses[houses['Bathrooms'] < 20]
Price Bathrooms Square_Feet
0 534433 2.0 1500
1 392333 3.5 2500
2 293222 2.0 1500

Option 2: Mark

# Load library
import numpy as np

# Create feature based on boolean condition
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1)

# Show data
houses
Price Bathrooms Square_Feet Outlier
0 534433 2.0 1500 0
1 392333 3.5 2500 0
2 293222 2.0 1500 0
3 4322032 116.0 48000 1

Option 3: Rescale

# Log feature
houses['Log_Of_Square_Feet'] = [np.log(x) for x in houses['Square_Feet']]

# Show data
houses
Price Bathrooms Square_Feet Outlier Log_Of_Square_Feet
0 534433 2.0 1500 0 7.313220
1 392333 3.5 2500 0 7.824046
2 293222 2.0 1500 0 7.313220
3 4322032 116.0 48000 1 10.778956