Missing Data

Missing data is when no data value is stored for the variable in an observation. Wikipedia: Missing data

Diagnose Missing Data
# Step 1: Import the libraries and load in the raw data.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import pandas as pd
raw_data = pd.read_csv('State_time_series.csv', header=0)


# Step 2: Take a look at how many missing values there are.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

columns_df = pd.DataFrame(
  raw_data.columns, columns=['column_name']
)
columns_df['total_nas'] = raw_data.isnull().sum().tolist()
columns_df = columns_df.sort_values(
  by=['total_nas'], 
  ascending=False
)

print('%s total rows' % len(raw_data))
print('%s total rows' % columns_df)
print(columns_df)
Remove Missing Data Rows
# Step 1: Import the libraries and load in the raw data.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import pandas as pd
raw_data = pd.read_csv('State_time_series.csv', header=0)


# Step 2: Set up the constants.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# We can list the specific columns for which we want to remove
# rows with a missing value. In this case, we'll do this for
# all the columns.
COLUMNS_TO_DROP_MISSING_VALUES = raw_data.columns


# Step 3: Drop the rows with missing values.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

clean_data = raw_data.copy().dropna(
  subset=[COLUMNS_TO_DROP_MISSING_VALUES],
  axis=0
)

# It's helpful to take a look at the results.
print('%s clean rows remaining out of %s'
  % (len(clean_data), len(raw_data))
)
print('')
print('Clean data:')
print(clean_data[:5])
Fill Missing Data Rows With Constant
# Step 1: Import the libraries and load in the raw data.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import pandas as pd
raw_data = pd.read_csv('State_time_series.csv', header=0)


# Step 2: Set up the constants.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# We can list the specific columns for which we want to remove
# rows with a missing value. In this case, we'll do this for
# all the columns.
COLUMNS_TO_FILL_WTIH_CONSTANT = raw_data.columns

# Choose the constant to fill missing values with.
FILL_CONSTANT = 0 


# Step 3: Fill the missing values.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

clean_data = raw_data.copy()
for column in raw_data.columns:
  clean_data[column] = clean_data[column].fillna(FILL_CONSTANT)

# It's helpful to take a look at the results.
print('Clean data:')
print(clean_data[:5])
Fill Missing Data Rows With Median
# Step 1: Import the libraries and load in the raw data.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import pandas as pd
import numpy as np
raw_data = pd.read_csv('State_time_series.csv', header=0)


# Step 2: Set up the constants.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# We can list the specific columns for which we want to remove
# rows with a missing value. In this case, we'll do this for
# all the columns.
COLUMNS_TO_FILL_WTIH_CONSTANT = raw_data.columns


# Step 3: Fill the missing values.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

clean_data = raw_data.copy()
for column in raw_data.columns:

  # We can only fill medians for columns with numeric values.
  if np.issubdtype(raw_data[column].dtype, np.number):

    # Get the median value and fill missing values with it.
    median_val = raw_data[column].median()
    print('Filling median %.2f for %s' % (median_val, column))
    clean_data[column] = clean_data[column].fillna(median_val)

# It's helpful to take a look at the results.
print('Clean data:')
print(clean_data[:5])

Notes

It's unlikely every column should be handled the same way; each column should be handled in a way that best suits the data in that column.