# 第一部分：了解你的指标

## 月收入

``````# import libraries
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division

import plotly.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go

#initiate visualization library for jupyter notebook
pyoff.init_notebook_mode()

``````

• 客户ID
• 单价
• 数量
• 订单日期

``````#converting the type of Invoice Date Field from string to datetime.
tx_data['InvoiceDate'] = pd.to_datetime(tx_data['InvoiceDate'])

#creating YearMonth field for the ease of reporting and visualization
tx_data['InvoiceYearMonth'] = tx_data['InvoiceDate'].map(lambda date: 100*date.year + date.month)

#calculate Revenue for each row and create a new dataframe with YearMonth - Revenue columns
tx_data['Revenue'] = tx_data['UnitPrice'] * tx_data['Quantity']
tx_revenue = tx_data.groupby(['InvoiceYearMonth'])['Revenue'].sum().reset_index()
tx_revenue
``````

``````#X and Y axis inputs for Plotly graph. We use Scatter for line graphs
plot_data = [
go.Scatter(
x=tx_revenue['InvoiceYearMonth'],
y=tx_revenue['Revenue'],
)
]

plot_layout = go.Layout(
xaxis={"type": "category"},
title='Montly Revenue'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
``````

``````#using pct_change() function to see monthly percentage change
tx_revenue['MonthlyGrowth'] = tx_revenue['Revenue'].pct_change()

#showing first 5 rows

#visualization - line graph
plot_data = [
go.Scatter(
x=tx_revenue.query("InvoiceYearMonth < 201112")['InvoiceYearMonth'],
y=tx_revenue.query("InvoiceYearMonth < 201112")['MonthlyGrowth'],
)
]

plot_layout = go.Layout(
xaxis={"type": "category"},
title='Montly Growth Rate'
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
``````

## 月活跃客户

``````#creating a new dataframe with UK customers only
tx_uk = tx_data.query("Country=='United Kingdom'").reset_index(drop=True)

#creating monthly active customers dataframe by counting unique Customer IDs
tx_monthly_active = tx_uk.groupby('InvoiceYearMonth')['CustomerID'].nunique().reset_index()

#print the dataframe
tx_monthly_active

#plotting the output
plot_data = [
go.Bar(
x=tx_monthly_active['InvoiceYearMonth'],
y=tx_monthly_active['CustomerID'],
)
]

plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly Active Customers'
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
``````

4月份，月活跃客户数量从923个降至817个(-11.5%)。我们还会看到订单数量的同样趋势。

## 月订单数量

``````#create a new dataframe for no. of order by using quantity field
tx_monthly_sales = tx_uk.groupby('InvoiceYearMonth')['Quantity'].sum().reset_index()

#print the dataframe
tx_monthly_sales

#plot
plot_data = [
go.Bar(
x=tx_monthly_sales['InvoiceYearMonth'],
y=tx_monthly_sales['Quantity'],
)
]

plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly Total # of Order'
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
``````

## 平均订单收入

``````# create a new dataframe for average revenue by taking the mean of it
tx_monthly_order_avg = tx_uk.groupby('InvoiceYearMonth')['Revenue'].mean().reset_index()

#print the dataframe
tx_monthly_order_avg

#plot the bar chart
plot_data = [
go.Bar(
x=tx_monthly_order_avg['InvoiceYearMonth'],
y=tx_monthly_order_avg['Revenue'],
)
]

plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly Order Average'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
``````

• 新客户比例：如果我们正在失去现有的客户或无法吸引新的客户，这是一个很好的指标
• 留存率：指标之王。指示在特定时间段内我们保留了多少客户。我们将展示月度留存率和基于群组的留存率的例子。

## 新客户比例

``````#create a dataframe contaning CustomerID and first purchase date
tx_min_purchase = tx_uk.groupby('CustomerID').InvoiceDate.min().reset_index()
tx_min_purchase.columns = ['CustomerID','MinPurchaseDate']
tx_min_purchase['MinPurchaseYearMonth'] = tx_min_purchase['MinPurchaseDate'].map(lambda date: 100*date.year + date.month)

#merge first purchase date column to our main dataframe (tx_uk)
tx_uk = pd.merge(tx_uk, tx_min_purchase, on='CustomerID')

#create a column called User Type and assign Existing
#if User's First Purchase Year Month before the selected Invoice Year Month
tx_uk['UserType'] = 'New'
tx_uk.loc[tx_uk['InvoiceYearMonth']>tx_uk['MinPurchaseYearMonth'],'UserType'] = 'Existing'

#calculate the Revenue per month for each user type
tx_user_type_revenue = tx_uk.groupby(['InvoiceYearMonth','UserType'])['Revenue'].sum().reset_index()

#filtering the dates and plot the result
tx_user_type_revenue = tx_user_type_revenue.query("InvoiceYearMonth != 201012 and InvoiceYearMonth != 201112")
plot_data = [
go.Scatter(
x=tx_user_type_revenue.query("UserType == 'Existing'")['InvoiceYearMonth'],
y=tx_user_type_revenue.query("UserType == 'Existing'")['Revenue'],
name = 'Existing'
),
go.Scatter(
x=tx_user_type_revenue.query("UserType == 'New'")['InvoiceYearMonth'],
y=tx_user_type_revenue.query("UserType == 'New'")['Revenue'],
name = 'New'
)
]

plot_layout = go.Layout(
xaxis={"type": "category"},
title='New vs Existing'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
``````

``````#create a dataframe that shows new user ratio - we also need to drop NA values (first month new user ratio is 0)
tx_user_ratio = tx_uk.query("UserType == 'New'").groupby(['InvoiceYearMonth'])['CustomerID'].nunique()/tx_uk.query("UserType == 'Existing'").groupby(['InvoiceYearMonth'])['CustomerID'].nunique()
tx_user_ratio = tx_user_ratio.reset_index()
tx_user_ratio = tx_user_ratio.dropna()

#print the dafaframe
tx_user_ratio

#plot the result

plot_data = [
go.Bar(
x=tx_user_ratio.query("InvoiceYearMonth>201101 and InvoiceYearMonth<201112")['InvoiceYearMonth'],
y=tx_user_ratio.query("InvoiceYearMonth>201101 and InvoiceYearMonth<201112")['CustomerID'],
)
]

plot_layout = go.Layout(
xaxis={"type": "category"},
title='New Customer Ratio'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
``````

## 月留存率

``````#identify which users are active by looking at their revenue per month
tx_user_purchase = tx_uk.groupby(['CustomerID','InvoiceYearMonth'])['Revenue'].sum().reset_index()

#create retention matrix with crosstab
tx_retention = pd.crosstab(tx_user_purchase['CustomerID'], tx_user_purchase['InvoiceYearMonth']).reset_index()

#create an array of dictionary which keeps Retained & Total User count for each month
months = tx_retention.columns[2:]
retention_array = []
for i in range(len(months)-1):
retention_data = {}
selected_month = months[i+1]
prev_month = months[i]
retention_data['InvoiceYearMonth'] = int(selected_month)
retention_data['TotalUserCount'] = tx_retention[selected_month].sum()
retention_data['RetainedUserCount'] = tx_retention[(tx_retention[selected_month]>0) & (tx_retention[prev_month]>0)][selected_month].sum()
retention_array.append(retention_data)

#convert the array to dataframe and calculate Retention Rate
tx_retention = pd.DataFrame(retention_array)
tx_retention['RetentionRate'] = tx_retention['RetainedUserCount']/tx_retention['TotalUserCount']

#plot the retention rate graph
plot_data = [
go.Scatter(
x=tx_retention.query("InvoiceYearMonth<201112")['InvoiceYearMonth'],
y=tx_retention.query("InvoiceYearMonth<201112")['RetentionRate'],
name="organic"
)

]

plot_layout = go.Layout(
xaxis={"type": "category"},
title='Monthly Retention Rate'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
``````

## 基于群体的留存率

``````#create our retention table again with crosstab() - we need to change the column names for using them in .query() function
tx_retention = pd.crosstab(tx_user_purchase['CustomerID'], tx_user_purchase['InvoiceYearMonth']).reset_index()
new_column_names = [ 'm_' + str(column) for column in tx_retention.columns]
tx_retention.columns = new_column_names

#create the array of Retained users for each cohort monthly
retention_array = []
for i in range(len(months)):
retention_data = {}
selected_month = months[i]
prev_months = months[:i]
next_months = months[i+1:]
for prev_month in prev_months:
retention_data[prev_month] = np.nan

total_user_count =  retention_data['TotalUserCount'] = tx_retention['m_' + str(selected_month)].sum()
retention_data[selected_month] = 1

query = "{} > 0".format('m_' + str(selected_month))

for next_month in next_months:
query = query + " and {} > 0".format(str('m_' + str(next_month)))
retention_data[next_month] = np.round(tx_retention.query(query)['m_' + str(next_month)].sum()/total_user_count,2)
retention_array.append(retention_data)

tx_retention = pd.DataFrame(retention_array)
tx_retention.index = months

#showing new cohort based retention table
tx_retention
``````

Tx_retention就是这个基于群体的留存率视图：

No account yet? Register