# Forecast of capital inflow and outflow — understanding and running through the baseline

# Data format processing
import pandas as pd
import numpy as np

# Temporal feature extraction
import datetime

# Ignore warnings: prevent them from interfering with program operation
import warnings
warnings.filterwarnings('ignore')

# Set the data set path
path = 'E:/Programming/Capital Inflow and Outflow Forecast/Purchase Redemption Data/'


# Extract timestamp: extract day, month, year, week from date
# Define functions and annotate the types of parameters and return values to increase readability
def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date') -> pd.DataFrame:
    """Parse the date column and append day/month/year/weekday features.

    Parameters
    ----------
    data : pd.DataFrame
        Table containing a YYYYMMDD-formatted date column.
    time_index : str
        Name of the column holding the raw date values.

    Returns
    -------
    pd.DataFrame
        A copy of ``data`` with 'date', 'day', 'month', 'year' and
        'weekday' columns added and a fresh 0..n-1 index.
    """
    # Copy so the caller's frame is never mutated.
    out = data.copy()
    out['date'] = pd.to_datetime(out[time_index], format="%Y%m%d")
    # Derive each calendar feature from the parsed timestamp.
    for feature in ('day', 'month', 'year', 'weekday'):
        out[feature] = getattr(out['date'].dt, feature)
    # Rows may have been dropped upstream during cleaning, leaving a gapped
    # index; reset it without keeping the old index as a column.
    return out.reset_index(drop=True)


# Load user subscription and redemption data table
def load_data(path: str = 'user_balance_table.csv') -> pd.DataFrame:
    """Read the user subscription/redemption CSV and add calendar features.

    Parameters
    ----------
    path : str
        Location of the user balance data file.

    Returns
    -------
    pd.DataFrame
        The raw records plus the timestamp columns produced by
        ``add_timestamp``, reindexed from zero.
    """
    raw = pd.read_csv(path)
    return add_timestamp(raw).reset_index(drop=True)


# Statistics of the total daily subscription and redemption amounts: intercept the data in the known time range 2014-03-31~2014-08-31
def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31') -> pd.DataFrame:
    """Aggregate purchase/redeem totals per day, keeping dates >= ``date``.

    Parameters
    ----------
    data : pd.DataFrame
        Per-user records with 'date', 'total_purchase_amt' and
        'total_redeem_amt' columns.
    date : str
        Inclusive lower bound for the returned dates. Records after
        September have nulled amounts upstream, so callers pass a window
        that excludes them.

    Returns
    -------
    pd.DataFrame
        One row per date with the summed amounts, reindexed from zero.
    """
    # Sum both amount columns across all users for each calendar day;
    # as_index=False keeps 'date' as a regular column.
    daily = (
        data.copy()
        .groupby('date', as_index=False)[['total_purchase_amt', 'total_redeem_amt']]
        .sum()
    )
    # Drop everything before the requested start date.
    return daily[daily['date'] >= date].reset_index(drop=True)


# Some of the approximately 30,000 randomly selected users first appeared in September 2014, and these users are only in the test data
# Generate test set section data: intercept the data in the prediction time range 2014,9,1~2014,9,30
def generate_test_data(data: pd.DataFrame) -> pd.DataFrame:
    """Append empty forecast rows for 2014-09-01 .. 2014-09-30.

    The prediction window has no known amounts, so each appended row keeps
    the date and leaves both amount columns as NaN.

    Parameters
    ----------
    data : pd.DataFrame
        Daily totals with exactly the columns
        ['date', 'total_purchase_amt', 'total_redeem_amt'].

    Returns
    -------
    pd.DataFrame
        The input rows followed by 30 placeholder September rows, with a
        fresh 0..n-1 index.
    """
    # Copy so the caller's frame is never mutated.
    total_balance = data.copy()
    start = datetime.datetime(2014, 9, 1)
    end = datetime.datetime(2014, 10, 1)
    testdata = []
    while start < end:
        # Date column filled in; the two amount fields stay empty (NaN).
        testdata.append([start, np.nan, np.nan])
        # BUGFIX: original had 'start + = ...' (space inside '+='), a SyntaxError.
        start += datetime.timedelta(days=1)
    # Give the placeholder frame the same column names, then stack it
    # beneath the training rows.
    testdata = pd.DataFrame(testdata)
    testdata.columns = total_balance.columns
    total_balance = pd.concat([total_balance, testdata], axis=0)
    return total_balance.reset_index(drop=True)


'''Data preprocessing'''
# Load the raw user balance records and attach calendar features.
balance_data = load_data(path + 'user_balance_table.csv')
# Collapse to one row per day, keeping dates from 2014-03-01 onward.
total_balance = get_total_balance(balance_data, date='2014-03-01')
# Append empty placeholder rows for the September prediction window.
total_balance = generate_test_data(total_balance)
# Re-derive day/month/year/weekday directly from the 'date' column.
total_balance = add_timestamp(total_balance, 'date') # 'date' is used as the time index field here


# Define the method for generating time series rule prediction results
def generate_base(data: pd.DataFrame, month_index: int) -> pd.DataFrame:
    """Produce a rule-based (cycle-factor) forecast for one month of 2014.

    Trains on daily totals from 2014-03-01 up to (but excluding) the first
    day of ``month_index``, then predicts every day of that month by
    combining a per-day-of-month base level with a weekday factor.

    Parameters
    ----------
    data : pd.DataFrame
        Daily totals holding at least 'date' (datetime64),
        'total_purchase_amt' and 'total_redeem_amt' columns.
    month_index : int
        Month of 2014 to predict (this script passes 4..9 in sequence).

    Returns
    -------
    pd.DataFrame
        One row per predicted day, sorted by date, with columns
        ['date', 'total_purchase_amt', 'total_redeem_amt'] (rounded).
    """
    amount_cols = ['total_purchase_amt', 'total_redeem_amt']

    # Work on a copy so the caller's table is untouched.
    total_balance = data.copy()
    total_balance = total_balance[['date'] + amount_cols]

    # Restrict training data to 2014-03-01 .. first day of the target month.
    # BUGFIX: the original line contained '& amp;' (an HTML-escaped '&'),
    # which is a SyntaxError; restored to the bitwise-and operator.
    total_balance = total_balance[
        (total_balance['date'] >= datetime.datetime(2014, 3, 1)) &
        (total_balance['date'] < datetime.datetime(2014, month_index, 1))]

    # Calendar features used by the factor model.
    total_balance['weekday'] = total_balance['date'].dt.weekday
    total_balance['day'] = total_balance['date'].dt.day
    total_balance['month'] = total_balance['date'].dt.month

    # Weekday factor: mean amount per weekday divided by the overall mean.
    mean_of_each_weekday = total_balance[['weekday'] + amount_cols]\
        .groupby('weekday', as_index=False).mean()
    mean_of_each_weekday = mean_of_each_weekday.rename(
        columns={c: c + '_weekdaymean' for c in amount_cols})
    mean_of_each_weekday['total_purchase_amt_weekdaymean'] /= np.mean(total_balance['total_purchase_amt'])
    mean_of_each_weekday['total_redeem_amt_weekdaymean'] /= np.mean(total_balance['total_redeem_amt'])

    # Attach the weekday factors back onto the training rows.
    total_balance = pd.merge(total_balance, mean_of_each_weekday, on='weekday', how='left')

    # How often each (day-of-month, weekday) pair occurs in the training span.
    weekday_count = total_balance[['day', 'weekday', 'date']].groupby(
        ['day', 'weekday'], as_index=False).count()
    weekday_count = pd.merge(weekday_count, mean_of_each_weekday, on='weekday')

    # Weight the weekday factor by its occurrence frequency, averaged over
    # the number of distinct training months, to get a per-day-of-month factor.
    n_months = len(np.unique(total_balance['month']))
    weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date'] / n_months
    weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date'] / n_months
    day_rate = weekday_count.drop(['weekday', 'date'], axis=1).groupby('day', as_index=False).sum()

    # Divide the per-day mean by the day factor to obtain the base level.
    day_mean = total_balance[['day'] + amount_cols].groupby('day', as_index=False).mean()
    day_pre = pd.merge(day_mean, day_rate, on='day', how='left')
    day_pre['total_purchase_amt'] /= day_pre['total_purchase_amt_weekdaymean']
    day_pre['total_redeem_amt'] /= day_pre['total_redeem_amt_weekdaymean']

    # Stamp each base row with its date in the target month.
    # ROBUSTNESS: pre-creating the column as NaT keeps it datetime64 so the
    # .dt accessor below works even when day 31 is skipped for a short month
    # (row-wise .loc assignment into a fresh column can otherwise yield an
    # object-dtype column in recent pandas).
    day_pre['date'] = pd.NaT
    for index, row in day_pre.iterrows():
        # Months with fewer than 31 days in the 4..9 window get no day-31 row.
        if month_index in (2, 4, 6, 9) and row['day'] == 31:
            break  # days are sorted ascending, so nothing valid remains
        day_pre.loc[index, 'date'] = datetime.datetime(2014, month_index, int(row['day']))

    # Re-apply the weekday factor for the actual predicted dates; the inner
    # merge drops any NaT (skipped) rows since their weekday is NaN.
    day_pre['weekday'] = day_pre.date.dt.weekday
    day_pre = day_pre[['date', 'weekday'] + amount_cols]
    day_pre = pd.merge(day_pre, mean_of_each_weekday, on='weekday')
    day_pre['total_purchase_amt'] *= day_pre['total_purchase_amt_weekdaymean']
    day_pre['total_redeem_amt'] *= day_pre['total_redeem_amt_weekdaymean']

    # Round the predictions to whole amounts.
    day_pre['total_purchase_amt'] = np.rint(day_pre['total_purchase_amt'])
    day_pre['total_redeem_amt'] = np.rint(day_pre['total_redeem_amt'])

    day_pre = day_pre.sort_values('date')[['date'] + amount_cols]
    return day_pre


# Work on a copy so the preprocessed table itself stays unchanged.
data = total_balance.copy()

# Generate prediction results
# # Accumulates one forecast frame per target month.
base_list = []
# # Refit the rule model month by month, targets April..September.
for i in range(4, 10):
    base_list.append(generate_base(data, i).reset_index(drop=True))
# # Stack the monthly forecast frames into one table.
base = pd.concat(base_list).reset_index(drop=True)
# # Suffix the forecast columns so they don't collide with the actuals.
for i in ['total_purchase_amt', 'total_redeem_amt']:
    base = base.rename(columns={i: i + '_base'})
# # Attach the forecasts to the daily table, joining on date.
data = pd.merge(data.reset_index(drop=True), base.reset_index(drop=True), on='date', how='left').reset_index(drop=True)

# Build the submission table
# # Keep only the columns required for submission.
data = data[['date', 'total_purchase_amt_base', 'total_redeem_amt_base']]
# # Keep only the September prediction window.
# # NOTE(review): the boolean mask is built from `total_balance`, not `data`;
# # this relies on both frames sharing the same 0..n-1 index after the merge
# # above — confirm the merge is one-to-one on 'date'.
data = data[total_balance['date'] >= datetime.datetime(2014, 9, 1)]
# # Format for submission: dates as YYYYMMDD strings, amounts as integers
# # (casting drops the trailing '.0' from the float forecasts).
for i in data.columns:
    if i == 'date':
        data[i] = data[i].astype(str)
        data[i] = data[i].str.replace('-', '')
    else:
        data[i] = data[i].astype(np.int64)

# Save prediction results locally: no index column, no header row.
data.to_csv(path + 'comp_predict_table.csv', index=False, header=False)