"""Rule-based forecast of daily fund purchase/redemption totals.

Loads the user balance table, aggregates per-day purchase and redemption
totals over the known window (2014-03 .. 2014-08), then predicts each
month from April to September with a weekday-factor / day-of-month-base
model and writes the September predictions as the submission CSV.
"""

# Data-frame processing
import pandas as pd
import numpy as np
# Temporal feature extraction
import datetime
# Suppress pandas chained-assignment noise so it does not clutter output
import warnings

warnings.filterwarnings('ignore')

# Dataset directory (adjust to the local environment)
path = 'E:/Programming/Capital Inflow and Outflow Forecast/Purchase Redemption Data/'


def add_timestamp(data: pd.DataFrame, time_index: str = 'report_date') -> pd.DataFrame:
    """Parse the ``time_index`` column and add day/month/year/weekday features.

    Parameters
    ----------
    data : pd.DataFrame
        Table containing a YYYYMMDD-formatted date column (or an already
        parsed datetime column when ``time_index='date'``).
    time_index : str
        Name of the column holding the date.

    Returns
    -------
    pd.DataFrame
        Copy of ``data`` with ``date``, ``day``, ``month``, ``year`` and
        ``weekday`` columns and a freshly reset RangeIndex.
    """
    # Work on a copy so the caller's frame is untouched
    data_balance = data.copy()
    # Convert the raw YYYYMMDD value to a proper datetime
    data_balance['date'] = pd.to_datetime(data_balance[time_index], format="%Y%m%d")
    # Extract calendar features used by the factor model
    data_balance['day'] = data_balance['date'].dt.day
    data_balance['month'] = data_balance['date'].dt.month
    data_balance['year'] = data_balance['date'].dt.year
    data_balance['weekday'] = data_balance['date'].dt.weekday
    # Earlier cleaning may have dropped rows, leaving a gappy index;
    # reset_index(drop=True) restores a contiguous index without adding a column.
    return data_balance.reset_index(drop=True)


def load_data(path: str = 'user_balance_table.csv') -> pd.DataFrame:
    """Read the user subscription/redemption CSV and add timestamp features."""
    data_balance = pd.read_csv(path)
    data_balance = add_timestamp(data_balance)
    return data_balance.reset_index(drop=True)


def get_total_balance(data: pd.DataFrame, date: str = '2014-03-31') -> pd.DataFrame:
    """Sum purchase/redemption amounts over all users per day, from ``date`` on.

    Rows after 2014-08-31 were nulled in the raw table, so callers pass a
    start date inside the known window 2014-03 .. 2014-08.
    """
    total_balance = data.copy()
    # One row per date: sum the two amount fields over all users
    total_balance = total_balance.groupby(total_balance['date'])[
        ['total_purchase_amt', 'total_redeem_amt']].sum()
    # Turn the date group key back into an ordinary column, in place
    total_balance.reset_index(inplace=True)
    # Keep only the requested window
    return total_balance[(total_balance['date'] >= date)].reset_index(drop=True)


def generate_test_data(data: pd.DataFrame) -> pd.DataFrame:
    """Append empty rows for the prediction window 2014-09-01 .. 2014-09-30.

    The amounts are left as NaN; only the date column is filled in.
    """
    total_balance = data.copy()
    # Build one [date, NaN, NaN] row per day of September 2014
    start = datetime.datetime(2014, 9, 1)
    testdata = []
    while start != datetime.datetime(2014, 10, 1):
        testdata.append([start, np.nan, np.nan])
        start += datetime.timedelta(days=1)
    testdata = pd.DataFrame(testdata)
    # Align column names with the training frame (date, purchase, redeem)
    testdata.columns = total_balance.columns
    # Stack training rows on top of the empty September rows
    total_balance = pd.concat([total_balance, testdata], axis=0)
    return total_balance.reset_index(drop=True)


def generate_base(data: pd.DataFrame, month_index: int) -> pd.DataFrame:
    """Predict daily totals of month ``month_index`` (2014) by the base method.

    Trains on 2014-03-01 up to (excluding) the first day of the target
    month.  A weekday factor (weekday mean / overall mean) is combined
    with a day-of-month base (per-day mean with the date factor divided
    out) to produce the prediction.

    Returns a frame with columns ``date``, ``total_purchase_amt`` and
    ``total_redeem_amt``, sorted by date.
    """
    total_balance = data.copy()
    total_balance = total_balance[['date', 'total_purchase_amt', 'total_redeem_amt']]
    # Training window: everything before the month being predicted
    total_balance = total_balance[
        (total_balance['date'] >= datetime.datetime(2014, 3, 1))
        & (total_balance['date'] < datetime.datetime(2014, month_index, 1))]
    total_balance['weekday'] = total_balance['date'].dt.weekday
    total_balance['day'] = total_balance['date'].dt.day
    total_balance['month'] = total_balance['date'].dt.month

    # Weekday factor: mean amount on each weekday relative to the global mean
    mean_of_each_weekday = total_balance[
        ['weekday', 'total_purchase_amt', 'total_redeem_amt']].groupby(
        'weekday', as_index=False).mean()
    for name in ['total_purchase_amt', 'total_redeem_amt']:
        mean_of_each_weekday = mean_of_each_weekday.rename(
            columns={name: name + '_weekdaymean'})
    mean_of_each_weekday['total_purchase_amt_weekdaymean'] /= np.mean(
        total_balance['total_purchase_amt'])
    mean_of_each_weekday['total_redeem_amt_weekdaymean'] /= np.mean(
        total_balance['total_redeem_amt'])

    # Attach the weekday factor to every training row ('weekday' is the join key)
    total_balance = pd.merge(total_balance, mean_of_each_weekday,
                             on='weekday', how='left')

    # Day-of-month factor: weight each weekday factor by how often that
    # weekday falls on each calendar day, averaged over the training months.
    weekday_count = total_balance[['day', 'weekday', 'date']].groupby(
        ['day', 'weekday'], as_index=False).count()
    weekday_count = pd.merge(weekday_count, mean_of_each_weekday, on='weekday')
    # Number of distinct training months, used to average the counts
    n_months = len(np.unique(total_balance['month']))
    weekday_count['total_purchase_amt_weekdaymean'] *= weekday_count['date'] / n_months
    weekday_count['total_redeem_amt_weekdaymean'] *= weekday_count['date'] / n_months
    day_rate = weekday_count.drop(['weekday', 'date'], axis=1).groupby(
        'day', as_index=False).sum()

    # Base: per-day mean with the date factor divided out
    day_mean = total_balance[['day', 'total_purchase_amt', 'total_redeem_amt']].groupby(
        'day', as_index=False).mean()
    day_pre = pd.merge(day_mean, day_rate, on='day', how='left')
    day_pre['total_purchase_amt'] /= day_pre['total_purchase_amt_weekdaymean']
    day_pre['total_redeem_amt'] /= day_pre['total_redeem_amt_weekdaymean']

    # Assign concrete dates of the target month.  Days come out of the
    # groupby sorted ascending, so day 31 is last and `break` safely skips
    # it for 30-day months.  NOTE(review): February (29/30) is not handled,
    # but month_index is only ever 4..9 here.
    for index, row in day_pre.iterrows():
        if month_index in (2, 4, 6, 9) and row['day'] == 31:
            break
        day_pre.loc[index, 'date'] = datetime.datetime(2014, month_index, int(row['day']))

    # Final prediction = base * weekday factor of the predicted date
    day_pre['weekday'] = day_pre.date.dt.weekday
    day_pre = day_pre[['date', 'weekday', 'total_purchase_amt', 'total_redeem_amt']]
    day_pre = pd.merge(day_pre, mean_of_each_weekday, on='weekday')
    day_pre['total_purchase_amt'] *= day_pre['total_purchase_amt_weekdaymean']
    day_pre['total_redeem_amt'] *= day_pre['total_redeem_amt_weekdaymean']
    # Round predictions to whole amounts
    day_pre['total_purchase_amt'] = np.rint(day_pre['total_purchase_amt'])
    day_pre['total_redeem_amt'] = np.rint(day_pre['total_redeem_amt'])
    day_pre = day_pre.sort_values('date')[['date', 'total_purchase_amt', 'total_redeem_amt']]
    return day_pre


def main() -> None:
    """Run preprocessing, month-by-month prediction and CSV export."""
    # --- Data preprocessing ---
    balance_data = load_data(path + 'user_balance_table.csv')
    total_balance = get_total_balance(balance_data, date='2014-03-01')
    total_balance = generate_test_data(total_balance)
    total_balance = add_timestamp(total_balance, 'date')

    data = total_balance.copy()

    # Predict months 4..9 in sequence, widening the training window each time
    base_list = []
    for i in range(4, 10):
        base_list.append(generate_base(data, i).reset_index(drop=True))
    base = pd.concat(base_list).reset_index(drop=True)

    # Rename prediction columns so they do not collide with the raw totals
    for i in ['total_purchase_amt', 'total_redeem_amt']:
        base = base.rename(columns={i: i + '_base'})

    # Merge predictions back onto the full date range
    data = pd.merge(data.reset_index(drop=True), base.reset_index(drop=True),
                    on='date', how='left').reset_index(drop=True)

    # Keep only the submission fields and the September prediction window
    data = data[['date', 'total_purchase_amt_base', 'total_redeem_amt_base']]
    data = data[data['date'] >= datetime.datetime(2014, 9, 1)]

    # Format for submission: dates as YYYYMMDD strings, amounts as integers
    for i in data.columns:
        if i == 'date':
            data[i] = data[i].astype(str)
            data[i] = data[i].str.replace('-', '')
        else:
            data[i] = data[i].astype(np.int64)

    # Save predictions without index or header, as the competition requires
    data.to_csv(path + 'comp_predict_table.csv', index=False, header=False)


if __name__ == '__main__':
    main()