# Inputs: x_train, y_train, x_test, y_test.

# Fit the model.
from sklearn.linear_model import BayesianRidge
model = BayesianRidge().fit(x_train, y_train)

# Get predictions.
y_predict = model.predict(x_test)

# Get the results.
from sklearn.metrics import mean_squared_error, r2_score
# Store the results under new names so we don't shadow the imported functions.
mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
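(The snippet above assumes x_train, y_train, x_test, and y_test already exist. If you'd like to run it in isolation before working through the full example below, here is a minimal sketch that fabricates those inputs from synthetic data; make_regression and its parameters here are purely illustrative.)

# Build synthetic inputs for the snippet above (illustration only).
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

x_all, y_all = make_regression(n_samples=200, n_features=5, noise=10.0)
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.2
)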
Working End-to-End Example
# Step 1: Import the libraries.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import numpy as np
import pandas as pd

from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Step 2: Set up the constants.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# The target feature is the price at which the house sold.
TARGET_FEATURE = 'price'

# We'll set aside 20% of the data to test the model.
TEST_SET_SIZE = 0.2

# There are some columns we won't use in this model.
FEATURES_TO_REMOVE = ['id', 'date']

# We need to know which features are categorical.
CATEGORICAL_FEATURES = ['waterfront', 'condition', 'zipcode']

# Step 3: Load in the raw data.
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# This assumes the data is in the same directory as this script.
# Here we load the data into a pandas DataFrame.
raw_data = pd.read_csv('kc_house_data.csv')

# It's helpful to take a quick look at the data.
print('Sample of loaded data:')
print(raw_data.sample(5))
print('')

target_data = raw_data[TARGET_FEATURE]
for percent in [25, 50, 75]:
    label = 'Target Feature, %dth Percentile:' % percent
    print(label, np.percentile(target_data, percent))
print('')

# Step 4: Set up the data.
# ~~~~~~~~~~~~~~~~~~~~~~~~

# Separate the X and Y values.
y_data = raw_data[TARGET_FEATURE]

# Using drop() doesn't change raw_data, only the return value.
# The axis=1 keyword tells pandas to drop a column (not a row).
x_data = raw_data.drop(TARGET_FEATURE, axis=1)

# Remove the unused features.
x_data = x_data.drop(FEATURES_TO_REMOVE, axis=1)

# To include an intercept, add a new column with a constant.
# (BayesianRidge also fits an intercept by default, so this explicit
# column is redundant here, but it's harmless.)
x_data['intercept'] = 1.0

# Turn categorical variables into dummy columns (0 or 1 values).
# Do this to avoid assuming a meaningful order of categories.
# Use drop_first to avoid multicollinearity among features.
x_data = pd.get_dummies(
    x_data,
    columns=CATEGORICAL_FEATURES,
    drop_first=True
)

# It's helpful to double check that the final data looks good.
print('Sample of data to use:')
print(x_data.sample(5))
print('')

# Split the data into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=TEST_SET_SIZE
)

# Step 5: Fit the model.
# ~~~~~~~~~~~~~~~~~~~~~~

model = BayesianRidge().fit(x_train, y_train)

# Yes, that's it!

# Step 6: Get the results.
# ~~~~~~~~~~~~~~~~~~~~~~~~

# Get predictions for the test set.
y_predict = model.predict(x_test)

mse = mean_squared_error(y_test, y_predict)
print('Mean Squared Error: %.2f' % mse)

r2 = r2_score(y_test, y_predict)
print('R2 Score: %.2f' % r2)

# As a custom metric, we're curious to check how many of the
# predictions were within 20% of the true value.
percent_diff = np.abs(y_predict - y_test) / y_test
result = (percent_diff < 0.2).sum() / len(y_test)
print('Fraction within 20%% of target value: %.2f' % result)
This regressor uses L2 regularization ("ridge regression") to avoid overfitting; rather than requiring you to hand-tune the regularization coefficient, it infers it from the training data itself.
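If you're curious what value it settled on, the fitted model exposes the precisions it inferred. A minimal sketch, assuming model is the fitted BayesianRidge from the example above (alpha_ and lambda_ are standard scikit-learn attributes):

# alpha_ is the estimated precision (inverse variance) of the noise;
# lambda_ is the estimated precision of the weights. Their ratio
# lambda_ / alpha_ acts as the effective ridge regularization strength.
print('Estimated noise precision (alpha_): %.4g' % model.alpha_)
print('Estimated weight precision (lambda_): %.4g' % model.lambda_)
print('Effective regularization strength: %.4g' % (model.lambda_ / model.alpha_))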