PyTorch house price prediction (regression with a small fully connected network)

Article directory

  • 1. Foreword
  • 2. Implementation method

1. Foreword

  1. Task objective: predict the final sale price of each house from the house-attribute data in the CSV files.
  2. Dataset: “Residential Attribute Dataset”, available from https://download.csdn.net/download/weixin_43721000/87785277
  3. Explanation of dataset fields:
    The field details are given in the file below ↓

2. Implementation method

# Import packages
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import os


# Clean data ----------------------------------------------- --------------------------
def clean_num(numeric_data):
    '''
    Print a quick overview of a numeric table and zero-fill its missing values.

    :param numeric_data: DataFrame of numeric columns; it is modified in place
    :return: the same DataFrame object, with every NaN replaced by 0
    '''

    # Summary statistics followed by the first ten rows, for a visual check.
    summary = numeric_data.describe()
    print(summary)
    preview = numeric_data.head(10)
    print(preview)

    # Boolean Series indexed by column name: True where the column has a NaN.
    has_nan = numeric_data.isna().any(axis=0)
    columns_to_fill = [name for name, flag in has_nan.items() if flag]

    # Only the affected columns are rewritten; the others are left untouched.
    for name in columns_to_fill:
        filled = numeric_data[name].fillna(0)
        numeric_data[name] = filled

    return numeric_data


def clean_text(non_numeric_data):
    '''
    Fill missing values in text columns and label-encode every column.

    Missing values become the string 'N/A'. Each column's distinct values are
    then replaced by the integer codes 1, 2, 3, ... in order of first
    appearance. The codes therefore depend on the rows passed in: two tables
    encoded in separate calls may give the same category different codes.

    :param non_numeric_data: DataFrame of non-numeric columns; it is modified in place
    :return: the same DataFrame object, with every column replaced by its codes
    '''

    print(non_numeric_data.describe())

    # Columns that contain at least one missing value.
    nan_flags = non_numeric_data.isna().any(axis=0)
    nan_columns = list(nan_flags[nan_flags].index)
    print(nan_columns)

    # Replace nan in those columns with the string 'N/A'
    for col in nan_columns:
        non_numeric_data[col] = non_numeric_data[col].fillna('N/A')

    # Replace all string labels with numbers.
    # Iterate over the frame's own columns (not a module-level list) so the
    # function works for any caller, and encode each column with a single
    # map() instead of one replace() per distinct value.
    for col in non_numeric_data.columns:
        codes = {
            value: code
            for code, value in enumerate(pd.unique(non_numeric_data[col]), start=1)
        }
        non_numeric_data[col] = non_numeric_data[col].map(codes)

    print(non_numeric_data.head())

    return non_numeric_data


# Define network ---------------------------------------------
class Net(nn.Module):
    """Fully connected regressor: three hidden layers, each clamped at zero, then a linear output."""

    def __init__(self, D_in, H1, H2, H3, D_out):
        """
        :param D_in: number of input features
        :param H1: width of the first hidden layer
        :param H2: width of the second hidden layer
        :param H3: width of the third hidden layer
        :param D_out: number of output values
        """
        super().__init__()

        # Layers are created in input-to-output order under fixed attribute
        # names, so parameter names in state_dict() stay linear1..linear4.
        self.linear1 = nn.Linear(D_in, H1)
        self.linear2 = nn.Linear(H1, H2)
        self.linear3 = nn.Linear(H2, H3)
        self.linear4 = nn.Linear(H3, D_out)

    def forward(self, x):
        """Run x through the three clamped hidden layers and the output layer."""
        hidden = x
        for layer in (self.linear1, self.linear2, self.linear3):
            # clamp(min=0) zeroes negative activations
            hidden = layer(hidden).clamp(min=0)
        # No activation on the output layer: the result is unbounded.
        return self.linear4(hidden)


def draw_losses(loss_list):
    '''
    Plot the loss value recorded at each training step.

    :param loss_list: sequence of loss values, one per training step
    :return: None; the figure is displayed with plt.show()
    '''
    plt.figure(figsize=(6, 4))
    # Plot the argument itself. The previous version read a module-level
    # `losses` variable, so the function failed (or plotted unrelated data)
    # unless the caller happened to define a global with that exact name.
    plt.plot(range(len(loss_list)), loss_list, label='loss_value')

    plt.legend(loc='upper right')
    plt.show()



if __name__ == '__main__':
    # End-to-end script: load the CSV files, clean and normalise them, train
    # the network on train.csv and print price predictions for test.csv.

    # Training part ==========================================================
    print(os.listdir("./data"))
    # ['data_description.txt', 'test.csv', 'train.csv']

    # Load data --------------------------------------------------------------
    # test.csv is loaded here as well, because its text columns are encoded
    # together with the training ones further down.
    raw_data = pd.read_csv('./data/train.csv')
    raw_test_data = pd.read_csv('./data/test.csv')

    print(raw_data.describe())
    # [8 rows x 38 columns]
    print(raw_data.head(10))
    # [10 rows x 81 columns]
    print(raw_test_data.describe())
    # [8 rows x 37 columns]

    # Separate numeric sample columns, non-numeric sample columns, label column
    label_column = 'SalePrice'
    numeric_colmuns = []
    numeric_colmuns.extend(list(raw_data.dtypes[raw_data.dtypes == np.int64].index))
    numeric_colmuns.extend(list(raw_data.dtypes[raw_data.dtypes == np.float64].index))
    numeric_colmuns.remove(label_column)  # delete the price column
    numeric_colmuns.remove('Id')  # delete the id column

    non_numeric_columns = [col for col in list(raw_data.columns) if col not in numeric_colmuns]
    non_numeric_columns.remove(label_column)  # delete the price column
    non_numeric_columns.remove('Id')  # delete the id column

    # Clean numeric samples, non-numeric samples, labels ----------------------
    numeric_data = DataFrame(raw_data, columns=numeric_colmuns)
    numeric_data = clean_num(numeric_data)

    # clean_text numbers categories in order of first appearance, so encoding
    # train and test in separate calls could give the same category different
    # codes. Encode both tables in one call, then split them again by row count.
    n_train = len(raw_data)
    combined_text = pd.concat(
        [
            DataFrame(raw_data, columns=non_numeric_columns),
            DataFrame(raw_test_data, columns=non_numeric_columns),
        ],
        axis=0,
        ignore_index=True,
    )
    combined_text = clean_text(combined_text)
    non_numeric_data = combined_text.iloc[:n_train].reset_index(drop=True)
    test_non_numeric_data = combined_text.iloc[n_train:].reset_index(drop=True)

    y_data = DataFrame(raw_data, columns=[label_column])
    y_data = clean_num(y_data)

    # Normalisation parameters, computed on the training data only. The same
    # parameters must be reused for the prediction inputs and for restoring
    # the predicted prices; recomputing them on the test set would shift the
    # inputs relative to what the network was trained on.
    means_dict, spans_dict = dict(), dict()

    # Merge numeric and encoded text columns into one feature table
    x_df = DataFrame(pd.concat([numeric_data, non_numeric_data], axis=1), columns=numeric_colmuns + non_numeric_columns)
    # Record normalisation parameters
    for col in x_df:
        means_dict[col] = x_df[col].mean()
        span = x_df[col].max() - x_df[col].min()
        # A constant column has zero span; divide by 1 instead so the column
        # becomes all zeros rather than NaN/inf.
        spans_dict[col] = span if span != 0 else 1.0
    # Mean-normalise: (value - mean) / (max - min), which lies within [-1, 1]
    for col in x_df:
        x_df[col] = (x_df[col] - means_dict[col]) / spans_dict[col]
    print(x_df.head())

    y_df = DataFrame(y_data, columns=[label_column])
    # Record normalisation parameters for the label
    for col in y_df:
        means_dict[col] = y_df[col].mean()
        span = y_df[col].max() - y_df[col].min()
        spans_dict[col] = span if span != 0 else 1.0
    for col in y_df:
        y_df[col] = (y_df[col] - means_dict[col]) / spans_dict[col]
    print(y_df.head())
    #SalePrice
    #0 0.038299
    #1 0.000804
    #2 0.059129
    #3 -0.056827
    #4 0.095929

    # Convert to tensors
    x_tensor = torch.tensor(x_df.values, dtype=torch.float)
    y_tensor = torch.tensor(y_df.values, dtype=torch.float)

    print(x_tensor.shape, y_tensor.shape)
    # torch.Size([1460, 79]) torch.Size([1460, 1])

    # Define input and output layer dimensions
    D_in, D_out = x_tensor.shape[1], y_tensor.shape[1]
    # Define the hidden layer dimensions
    H1, H2, H3 = 500, 1000, 200

    # Initialize the network
    model = Net(D_in, H1, H2, H3, D_out)
    # Define the loss function (squared error summed over all samples)
    criterion = nn.MSELoss(reduction='sum')
    # Define optimizer, learning rate
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

    # Train (full batch, 500 steps)
    losses = []
    for t in range(500):
        y_pred = model(x_tensor)  # forward propagation

        loss = criterion(y_pred, y_tensor)  # calculate the loss
        print(t, loss.item())
        losses.append(loss.item())  # record loss

        # Stop if the loss becomes nan (nan inputs were cleaned above, so this
        # should not trigger; it guards against numerical divergence)
        if torch.isnan(loss):
            break

        optimizer.zero_grad()  # clear accumulated gradients
        loss.backward()  # backpropagation
        optimizer.step()  # update the parameters

    # Draw the loss value change image
    draw_losses(loss_list=losses)

    # Prediction part ========================================================
    # Clean numeric samples; the text samples were already encoded above
    numeric_data = DataFrame(raw_test_data, columns=numeric_colmuns)
    numeric_data = clean_num(numeric_data)

    # Merge into one feature table, in the same column order as for training
    x_df = DataFrame(pd.concat([numeric_data, test_non_numeric_data], axis=1), columns=numeric_colmuns + non_numeric_columns)
    # Normalise with the parameters recorded on the training data
    for col in x_df.columns:
        x_df[col] = (x_df[col] - means_dict[col]) / spans_dict[col]

    print(x_df.head())
    # [5 rows x 79 columns]

    # Convert to tensor
    x_tensor = torch.tensor(x_df.values, dtype=torch.float)

    print(x_tensor.shape)
    # torch.Size([1459, 79])

    # Inference only: no gradients are needed
    model.eval()
    with torch.no_grad():
        test_y = model(x_tensor)
    print(test_y)

    # Convert to pandas
    result = DataFrame(test_y.numpy(), columns=[label_column])
    result[label_column] = result[label_column].fillna(0)

    # Undo the label normalisation to get prices back
    result[label_column] = result[label_column] * spans_dict[label_column] + means_dict[label_column]

    # Add the id column from test.csv itself, so each prediction is labelled
    # with the house it belongs to rather than with its row position
    result['Id'] = raw_test_data['Id'].to_numpy()

    # Adjust column order
    result = DataFrame(result, columns=['Id', label_column])

    # Print prediction results
    print(result)
    # [1459 rows x 2 columns]

Loss curve ↓