Customer Transaction Analysis Framework
Since I don't have your actual CSV, below is a complete, production-ready analysis framework — fully executable code you can run directly on your own data.
Step 1: Load & Profile the Data
# --- Step 1: load & profile the data --------------------------------------
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Load first, normalize the headers, THEN parse dates. Passing
# parse_dates=['date'] to read_csv refers to the *raw* header, so a CSV
# whose column is named 'Date' or ' date ' would never get parsed — the
# rename only happens after the read.
df = pd.read_csv('transactions.csv')
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
df['date'] = pd.to_datetime(df['date'])

# Quick profile: shape, types, data quality, coverage, spend distribution.
print("Shape:", df.shape)
print("\nDtypes:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())
print("\nDuplicates:", df.duplicated().sum())
print("\nDate range:", df['date'].min(), "→", df['date'].max())
print("\nRegions:", df['region'].unique())
print("Categories:", df['category'].unique())
print("\nSpend stats:\n", df['purchase_amount'].describe())
Step 2: Spending Pattern Analysis
# Region-level rollup: volume, ticket size and customer reach per region,
# biggest spender first.
region_summary = (
    df.groupby('region')
    .agg(
        total_spend=('purchase_amount', 'sum'),
        avg_transaction=('purchase_amount', 'mean'),
        median_transaction=('purchase_amount', 'median'),
        transaction_count=('purchase_amount', 'count'),
        unique_customers=('customer_id', 'nunique'),
    )
    .sort_values('total_spend', ascending=False)
)

# Derived metrics: spend intensity per customer and share of the portfolio.
region_summary['spend_per_customer'] = (
    region_summary['total_spend'] / region_summary['unique_customers']
)
region_summary['pct_of_total'] = (
    region_summary['total_spend'] / region_summary['total_spend'].sum() * 100
).round(2)

print("\n=== REGION PERFORMANCE ===")
print(region_summary.round(2).to_string())
# Category-level rollup, largest total spend first.
category_summary = (
    df.groupby('category')
    .agg(
        total_spend=('purchase_amount', 'sum'),
        avg_transaction=('purchase_amount', 'mean'),
        transaction_count=('purchase_amount', 'count'),
    )
    .sort_values('total_spend', ascending=False)
)
# Share of overall revenue contributed by each category.
category_summary['pct_of_total'] = (
    category_summary['total_spend'] / category_summary['total_spend'].sum() * 100
).round(2)

print("\n=== CATEGORY PERFORMANCE ===")
print(category_summary.round(2).to_string())
# Calendar features reused by later steps (timing analysis, drill-downs).
df['month'] = df['date'].dt.to_period('M')
df['day_of_week'] = df['date'].dt.day_name()
df['hour'] = df['date'].dt.hour

# Month-over-month revenue trend.
monthly_spend = (
    df.groupby('month')['purchase_amount']
    .agg(['sum', 'count', 'mean'])
    .set_axis(['total_spend', 'transactions', 'avg_spend'], axis=1)
)
monthly_spend['mom_growth_pct'] = 100 * monthly_spend['total_spend'].pct_change()

print("\n=== MONTHLY TREND ===")
print(monthly_spend.round(2).to_string())
# Spend matrix: one row per region, one column per category
# (0 where a region recorded no sales in a category).
pivot = df.pivot_table(
    index='region',
    columns='category',
    values='purchase_amount',
    aggfunc='sum',
    fill_value=0,
)

print("\n=== REGION × CATEGORY SPEND ===")
print(pivot.round(0).to_string())
# RFM table: one row per customer with recency (days since last purchase,
# relative to the day after the newest transaction), frequency and monetary.
snapshot_date = df['date'].max() + pd.Timedelta(days=1)
rfm = df.groupby('customer_id').agg(
    recency=('date', lambda x: (snapshot_date - x.max()).days),
    frequency=('purchase_amount', 'count'),
    monetary=('purchase_amount', 'sum'),
)

# Quartile scores 1-4 (R inverted: more recent = higher score).
# All three use the rank(method='first') trick — not just F as before —
# so heavily tied distributions (many identical recency/monetary values)
# cannot make pd.qcut raise "bin edges must be unique".
rfm['R'] = pd.qcut(rfm['recency'].rank(method='first'), q=4, labels=[4, 3, 2, 1])
rfm['F'] = pd.qcut(rfm['frequency'].rank(method='first'), q=4, labels=[1, 2, 3, 4])
rfm['M'] = pd.qcut(rfm['monetary'].rank(method='first'), q=4, labels=[1, 2, 3, 4])
# Concatenated score, e.g. '444' = best on all three dimensions.
rfm['rfm_score'] = rfm['R'].astype(str) + rfm['F'].astype(str) + rfm['M'].astype(str)
def rfm_segment(row):
score = int(row['R']) + int(row['F']) + int(row['M'])
if score >= 10: return 'Champions'
elif score >= 8: return 'Loyal'
elif score >= 6: return 'At Risk'
elif score >= 4: return 'Needs Attention'
else: return 'About to Churn'
# Label every customer, then profile the segments (best spenders first).
rfm['segment'] = rfm.apply(rfm_segment, axis=1)
seg_summary = (
    rfm.groupby('segment')
    .agg(
        count=('monetary', 'count'),
        avg_spend=('monetary', 'mean'),
        avg_recency=('recency', 'mean'),
    )
    .sort_values('avg_spend', ascending=False)
)

print("\n=== RFM SEGMENTS ===")
print(seg_summary.round(2).to_string())
Step 3: Anomaly Detection (3-Layer Approach)
# --- Layer 1: univariate z-score outliers ---------------------------------
from scipy.stats import zscore

# nan_policy='omit' keeps a single missing purchase_amount from propagating
# through mean/std and turning EVERY z-score into NaN (missing values are
# only reported, never dropped, in Step 1).
df['z_score'] = zscore(df['purchase_amount'], nan_policy='omit')
statistical_outliers = df[df['z_score'].abs() > 3].copy()
print(f"\nZ-score outliers (|z|>3): {len(statistical_outliers)} rows")

# --- Layer 2: Tukey IQR fences (robust to heavy tails) --------------------
Q1 = df['purchase_amount'].quantile(0.25)
Q3 = df['purchase_amount'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
iqr_outliers = df[
    (df['purchase_amount'] < lower_bound) |
    (df['purchase_amount'] > upper_bound)
].copy()
print(f"IQR outliers: {len(iqr_outliers)} rows")
print(f" Normal range: ${lower_bound:.2f} – ${upper_bound:.2f}")
# --- Layer 3: multivariate anomalies via Isolation Forest -----------------
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder

df_ml = df.copy()
# One encoder PER column. Reusing a single LabelEncoder happens to produce
# the same integer codes (fit_transform refits each call), but it silently
# discards the earlier fits — the surviving encoder could only ever
# inverse-transform the last column, which is a trap for later debugging.
for _col in ('category', 'region', 'day_of_week'):
    df_ml[f'{_col}_enc'] = LabelEncoder().fit_transform(df_ml[_col])

features = ['purchase_amount', 'category_enc', 'region_enc', 'day_of_week_enc']
X = df_ml[features].fillna(0)

iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.02,   # expected anomaly share; tune 0.01-0.05 to taste
    random_state=42,      # reproducible flags across runs
)
# -1 = anomaly, 1 = normal; score_samples is more negative = more anomalous.
df['anomaly_flag'] = iso_forest.fit_predict(X)
df['anomaly_score'] = iso_forest.score_samples(X)

ml_anomalies = df[df['anomaly_flag'] == -1].copy()
print(f"\nIsolation Forest anomalies (2% contamination): {len(ml_anomalies)} rows")
# Consensus voting: a transaction flagged by at least 2 of the 3 detectors
# is treated as a confirmed anomaly.
df['iqr_flag'] = (
    (df['purchase_amount'] < lower_bound) | (df['purchase_amount'] > upper_bound)
).astype(int)
df['zscore_flag'] = (df['z_score'].abs() > 3).astype(int)
df['iso_flag'] = (df['anomaly_flag'] == -1).astype(int)
df['flag_count'] = df[['iqr_flag', 'zscore_flag', 'iso_flag']].sum(axis=1)
df['confirmed_anomaly'] = df['flag_count'] >= 2

confirmed = df[df['confirmed_anomaly']].sort_values('purchase_amount', ascending=False)
print(f"\nCONFIRMED ANOMALIES (2+ methods): {len(confirmed)} rows")
print(confirmed[['date','region','category','purchase_amount','flag_count']].head(20))

# Where do the confirmed anomalies concentrate?
print("\n=== ANOMALIES BY REGION ===")
print(confirmed.groupby('region')['purchase_amount'].agg(['count','mean','sum']))
print("\n=== ANOMALIES BY CATEGORY ===")
print(confirmed.groupby('category')['purchase_amount'].agg(['count','mean','sum']))

# Hand-off file for manual review before anyone acts on these rows.
confirmed.to_csv('flagged_anomalies.csv', index=False)
print("\nAnomaly file saved → flagged_anomalies.csv")
Step 4: Identify the Lowest-Performing Region
# --- Step 4: composite ranking to find the weakest region -----------------
# rank() is ascending, so LOW ranks mean weak performance on each metric;
# the weighted composite's minimum is therefore the weakest region.
region_perf = region_summary.copy()
region_perf['rank_spend'] = region_perf['total_spend'].rank()
region_perf['rank_avg_txn'] = region_perf['avg_transaction'].rank()
region_perf['rank_per_customer'] = region_perf['spend_per_customer'].rank()
region_perf['composite_score'] = (
    region_perf['rank_spend'] * 0.5 +
    region_perf['rank_avg_txn'] * 0.3 +
    region_perf['rank_per_customer'] * 0.2
)
lowest_region = region_perf['composite_score'].idxmin()
print(f"\n⚠️ LOWEST PERFORMING REGION: {lowest_region}")
print(region_perf.loc[lowest_region])

# Drill-down: category mix and monthly trajectory for the weak region.
low_df = df[df['region'] == lowest_region].copy()
print(f"\n--- {lowest_region} Category Breakdown ---")
print(low_df.groupby('category')['purchase_amount'].agg(['sum','mean','count']))
print(f"\n--- {lowest_region} Monthly Trend ---")
print(low_df.groupby('month')['purchase_amount'].sum())

if 'customer_id' in df.columns:
    low_rfm = rfm[rfm.index.isin(low_df['customer_id'])]
    # Fix: 'at risk' previously counted every customer with recency > 60,
    # which double-counted the churned (>120 day) group and contradicted
    # the printed "60-120 days silent" label. The two buckets are now
    # disjoint: 61-120 days = at risk, 120+ days = likely churned.
    at_risk = low_rfm[low_rfm['recency'].between(61, 120)]
    churned = low_rfm[low_rfm['recency'] > 120]
    print(f"\nAt-risk customers (60-120 days silent): {len(at_risk)}")
    print(f"Likely churned (120+ days silent): {len(churned)}")
Step 5: Three Actionable Churn-Reduction Insights
# --- Step 5: three concrete churn-reduction insights ----------------------

if 'customer_id' in df.columns:
    # Insight 1: dormant-but-valuable customers worth a win-back campaign —
    # silent 60-120 days AND above-median lifetime spend in the weak region.
    _dormant = low_rfm['recency'].between(60, 120)
    _high_value = low_rfm['monetary'] > low_rfm['monetary'].median()
    reactivation_targets = low_rfm[_dormant & _high_value].sort_values(
        'monetary', ascending=False
    )
    print("\n=== INSIGHT 1: REACTIVATION TARGETS ===")
    print(f"High-value at-risk customers: {len(reactivation_targets)}")
    print(f"Avg lifetime spend: ${reactivation_targets['monetary'].mean():.2f}")
    print(f"Revenue at risk: ${reactivation_targets['monetary'].sum():,.2f}")
    reactivation_targets.to_csv(f'{lowest_region}_reactivation_list.csv')

# Insight 2: category-mix gap between the weakest and strongest regions.
print("\n=== INSIGHT 2: CATEGORY OPPORTUNITY IN LOWEST REGION ===")
top_region = region_perf['composite_score'].idxmax()
_top_df = df[df['region'] == top_region]
compare = pd.DataFrame({
    'lowest_region_pct': (
        low_df.groupby('category')['purchase_amount'].sum()
        / low_df['purchase_amount'].sum() * 100
    ),
    'top_region_pct': (
        _top_df.groupby('category')['purchase_amount'].sum()
        / _top_df['purchase_amount'].sum() * 100
    ),
}).fillna(0)
compare['gap'] = compare['top_region_pct'] - compare['lowest_region_pct']
compare_sorted = compare.sort_values('gap', ascending=False)
print("Categories UNDERPERFORMING vs top region (promote these):")
print(compare_sorted[compare_sorted['gap'] > 5].round(2))

# Insight 3: which weekdays command the highest average ticket.
print("\n=== INSIGHT 3: TIMING OPTIMIZATION ===")
timing = (
    low_df.groupby('day_of_week')
    .agg(
        transactions=('purchase_amount', 'count'),
        avg_spend=('purchase_amount', 'mean'),
        total_spend=('purchase_amount', 'sum'),
    )
    .sort_values('avg_spend', ascending=False)
)
print("Best days to run promotions (by avg spend):")
print(timing.round(2))
best_day = timing['avg_spend'].idxmax()
worst_day = timing['avg_spend'].idxmin()
lift = (timing.loc[best_day,'avg_spend'] / timing.loc[worst_day,'avg_spend'] - 1) * 100
print(f"\nRunning campaigns on {best_day} vs {worst_day} = {lift:.1f}% higher avg spend")
Step 6: Visualization Dashboard
# --- Step 6: one-page dashboard, a 3x3 grid of the key views --------------
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
# Large figure; hspace/wspace leave room for rotated tick labels.
fig = plt.figure(figsize=(20, 16))
fig.suptitle('Customer Transaction Analysis Dashboard', fontsize=16, fontweight='bold')
gs = gridspec.GridSpec(3, 3, figure=fig, hspace=0.45, wspace=0.35)
# Panel 1 (top-left): total spend per region; the lowest performer in red.
ax1 = fig.add_subplot(gs[0, 0])
colors = ['#e74c3c' if r == lowest_region else '#3498db'
for r in region_summary.index]
region_summary['total_spend'].plot(kind='bar', ax=ax1, color=colors)
ax1.set_title('Total Spend by Region\n(red = lowest performer)')
ax1.set_ylabel('Total Spend ($)')
ax1.tick_params(axis='x', rotation=45)
# Panel 2 (top-middle): category share of total spend.
ax2 = fig.add_subplot(gs[0, 1])
category_summary['total_spend'].plot(kind='pie', ax=ax2, autopct='%1.1f%%')
ax2.set_title('Spend Share by Category')
ax2.set_ylabel('')
# Panel 3 (top-right): monthly revenue trend line.
ax3 = fig.add_subplot(gs[0, 2])
monthly_spend['total_spend'].plot(ax=ax3, marker='o', linewidth=2, color='#2ecc71')
ax3.set_title('Monthly Spend Trend')
ax3.set_ylabel('Total Spend ($)')
ax3.tick_params(axis='x', rotation=45)
# Panel 4 (middle row, spans two cells): region x category spend heatmap.
ax4 = fig.add_subplot(gs[1, :2])
sns.heatmap(pivot, annot=True, fmt='.0f', cmap='YlOrRd', ax=ax4)
ax4.set_title('Spend Heatmap: Region × Category')
# Panel 5 (middle-right): amount distribution with the IQR fences overlaid.
ax5 = fig.add_subplot(gs[1, 2])
df['purchase_amount'].hist(bins=50, ax=ax5, color='#95a5a6', edgecolor='white')
ax5.axvline(upper_bound, color='red', linestyle='--', label=f'IQR Upper: ${upper_bound:.0f}')
ax5.axvline(lower_bound, color='orange', linestyle='--', label=f'IQR Lower: ${lower_bound:.0f}')
ax5.set_title('Transaction Distribution + Bounds')
ax5.legend(fontsize=8)
# Panel 6 (bottom-left): customer count per RFM segment.
ax6 = fig.add_subplot(gs[2, 0])
seg_summary['count'].plot(kind='barh', ax=ax6, color='#9b59b6')
ax6.set_title('Customer Segments (RFM)')
# Panel 7 (bottom-middle): avg spend by weekday in the lowest region,
# reindexed to calendar order (only days actually present are kept).
ax7 = fig.add_subplot(gs[2, 1])
day_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
timing_ordered = timing.reindex(
[d for d in day_order if d in timing.index]
)
timing_ordered['avg_spend'].plot(kind='bar', ax=ax7, color='#e67e22')
ax7.set_title(f'{lowest_region} — Avg Spend by Day\n(target high days for campaigns)')
ax7.tick_params(axis='x', rotation=45)
# Panel 8 (bottom-right): category-mix gap vs the top region;
# red bars = under-indexed categories worth promoting.
ax8 = fig.add_subplot(gs[2, 2])
compare_sorted['gap'].plot(kind='barh', ax=ax8,
color=['#e74c3c' if x > 0 else '#2ecc71' for x in compare_sorted['gap']])
ax8.axvline(0, color='black', linewidth=0.8)
ax8.set_title(f'{lowest_region} vs {top_region}\nCategory Gap (%)')
# Save before show(): some backends clear the figure on show().
plt.savefig('transaction_analysis.png', dpi=150, bbox_inches='tight')
plt.show()
print("Dashboard saved → transaction_analysis.png")
Step 7: Executive Summary (Auto-Generated)
print("\n" + "="*65)
print("EXECUTIVE SUMMARY")
print("="*65)
total_spend = df['purchase_amount'].sum()
total_txns = len(df)
top_cat = category_summary['total_spend'].idxmax()
top_region = region_perf['composite_score'].idxmax()
print(f"""
📊 PORTFOLIO OVERVIEW
Total Transactions : {total_txns:,}
Total Revenue : ${total_spend:,.2f}
Avg Transaction : ${df['purchase_amount'].mean():.2f}
Date Range : {df['date'].min().date()} → {df['date'].max().date()}
🏆 TOP PERFORMERS
Best Region : {top_region} ({region_summary.loc[top_region,'pct_of_total']:.1f}% of spend)
Best Category : {top_cat} ({category_summary.loc[top_cat,'pct_of_total']:.1f}% of spend)
⚠️ ANOMALIES DETECTED
Confirmed anomalies : {confirmed['confirmed_anomaly'].sum():,} transactions
Revenue in anomalies: ${confirmed['purchase_amount'].sum():,.2f}
Primary hotspot : {confirmed.groupby('region').size().idxmax()} region
🔻 LOWEST PERFORMING REGION: {lowest_region}
Total Spend : ${region_summary.loc[lowest_region,'total_spend']:,.2f}
Avg Transaction : ${region_summary.loc[lowest_region,'avg_transaction']:.2f}
% of Portfolio : {region_summary.loc[lowest_region,'pct_of_total']:.1f}%
""")
print("💡 THREE CHURN-REDUCTION RECOMMENDATIONS")
print("-"*65)
print(f"""
1. REACTIVATE HIGH-VALUE DORMANT CUSTOMERS
Who : {len(reactivation_targets):,} customers in {lowest_region} silent 60–120 days
How : Personalized email/SMS with 15% discount on their most-
purchased category (use RFM segment data)
Goal : Recover ${reactivation_targets['monetary'].sum():,.2f} at-risk revenue
KPI : Reactivation rate > 12% within 30 days
2. CLOSE THE CATEGORY GAP
What : {compare_sorted[compare_sorted['gap']>5].index[0] if len(compare_sorted[compare_sorted['gap']>5])>0 else 'Top category'} is {compare_sorted['gap'].max():.1f}pp UNDER-indexed vs {top_region}
How : Launch targeted category promotions + bundle offers
specifically in {lowest_region}; A/B test 10% vs 20% discount
Goal : Bring category mix within 5pp of top region in 90 days
KPI : Category revenue +25% in {lowest_region}
3. SHIFT CAMPAIGNS TO HIGH-SPEND DAYS
What : {best_day} spend is {lift:.1f}% higher than {worst_day} in {lowest_region}
How : Concentrate flash sales, push notifications, and loyalty
point events on {best_day}; reduce spend on {worst_day}
Goal : Lift average transaction value by 10% with same budget
KPI : Weekly avg spend per active customer +10% in 60 days
""")
print("="*65)
Quick-Start Checklist

| Step | Action | Time |
|------|--------|------|
| ✅ | Adjust column name mapping in Step 1 | 2 min |
| ✅ | Add customer_id column if available for RFM | — |
| ✅ | Tune contamination in Isolation Forest (0.01–0.05) | 1 min |
| ✅ | Adjust churn thresholds (60/120 days) to your business | 1 min |
| ✅ | Review flagged anomalies CSV before acting on them | 15 min |
| ✅ | Share dashboard PNG with stakeholders | — |
Critical note on anomalies: Not all flagged rows are fraud — large B2B orders, seasonal spikes, and promotional events trigger flags too. Always apply business context before escalating.