diff --git a/.ipynb_checkpoints/main-checkpoint.ipynb b/.ipynb_checkpoints/main-checkpoint.ipynb index 90f3c1f..c5ecb92 100644 --- a/.ipynb_checkpoints/main-checkpoint.ipynb +++ b/.ipynb_checkpoints/main-checkpoint.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0", "metadata": { "tags": [] @@ -89,104 +89,46 @@ " \"Segment\", \"Risk\", \"SAR_FLAG\"]\n", " df = pd.DataFrame(row_list, columns = cols)\n", " df[\"Cash_deposit_total\"] = df[\"Cash_deposit_total\"].astype(float)\n", + " \n", + " \n", + "\n", + " # Step 1: Compute 90th percentiles per Segment for all 3 fields\n", + " percentiles = (\n", + " df.groupby(\"Segment\")[[\"Cash_deposit_total\",\n", + " \"Cash_deposit_count\"]]\n", + " .quantile(0.98)\n", + " .reset_index()\n", + " )\n", + "\n", + " # Rename columns for clarity\n", + " percentiles = percentiles.rename(columns={\n", + " \"Cash_deposit_total\": \"P90_Credit\",\n", + " \"Cash_deposit_count\": \"P90_Credit_count\"\n", + " })\n", + "\n", + " # Step 2: Merge back to main df\n", + " df = df.merge(percentiles, on=\"Segment\", how=\"left\")\n", + "\n", + " # Step 3: Identify customers above 90th percentile in ANY of the 3 metrics\n", + " high_pop = (\n", + " (df[\"Cash_deposit_total\"] > df[\"P90_Credit\"]) &\n", + " (df[\"Cash_deposit_count\"] > df[\"P90_Credit_count\"])\n", + " )\n", + "\n", + " # Step 4: Randomly select 0.1% sample from high-risk population\n", + " sample_fraction = 0.1 # 0.1%\n", + " high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index\n", + "\n", + " # Step 5: Set SAR_FLAG values\n", + " df[\"SAR_FLAG\"] = \"N\" # default for all\n", + " df.loc[high_pop_indices, \"SAR_FLAG\"] = \"Y\" # assign Y to 0.1% random high-risk population\n", + "\n", " return df" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "b6c85de2-6a47-4109-8885-c138c289ec25", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# import pandas as pd\n", - "\n", - "# query = \"\"\"\n", - "# SELECT \n", - "# t.transaction_id,\n", - "# t.transaction_date,\n", - "# t.transaction_amount,\n", - "# t.transaction_desc,\n", - "# t.benef_account_number,\n", - "\n", - "# -- Account data\n", - "# a.account_number,\n", - "# a.customer_number AS acc_customer_number,\n", - "# a.account_type,\n", - "# a.branch_code,\n", - "\n", - "# -- Party data\n", - "# p.customer_number AS party_customer_number,\n", - "# p.customer_name,\n", - "# p.date_of_birth,\n", - "# p.nationality,\n", - "# p.business_segment,\n", - "# CASE\n", - "# WHEN p.risk_classification = 1 THEN 'Low Risk'\n", - "# WHEN p.risk_classification = 2 THEN 'Medium Risk'\n", - "# WHEN p.risk_classification = 3 THEN 'High Risk'\n", - "# ELSE 'Unknown Risk'\n", - "# END AS risk_level,\n", - "\n", - "# -- Alert data\n", - "# COALESCE(al.sar_flag, 'N') AS sar_flag\n", - "\n", - "# FROM {trans_data} t\n", - "\n", - "# -- Join with account data on beneficiary account\n", - "# LEFT JOIN {acc_data} a\n", - "# ON t.benef_account_number = a.account_number\n", - "\n", - "# -- Join with party/customer data using account's customer number\n", - "# LEFT JOIN {cust_data} p\n", - "# ON a.customer_number = p.customer_number\n", - "\n", - "# -- Join with alert data using party's customer number\n", - "# LEFT JOIN {alert_data} al\n", - "# ON p.customer_number = al.customer_number\n", - "\n", - "# WHERE a.account_number IS NOT NULL\n", - "# limit 100\n", - "# \"\"\"\n", - "\n", - "# from tms_data_interface import SQLQueryInterface\n", - "\n", - "# class Scenario:\n", - "# seq = SQLQueryInterface(schema=\"transactionschema\")\n", - "\n", - "# def logic(self, **kwargs):\n", - "# row_list = self.seq.execute_raw(query.format(trans_data=\"transaction10m\",\n", - "# cust_data=\"customer_data_v1\",\n", - "# acc_data=\"account_data_v1\",\n", - "# alert_data=\"alert_data_v1\")\n", - "# )\n", - "# cols = [\n", - "# \"transaction_id\",\n", - "# \"transaction_date\",\n", - "# \"transaction_amount\",\n", - "# \"transaction_desc\",\n", - "# \"benef_account_number\",\n", - "# \"account_number\",\n", - "# \"acc_customer_number\",\n", - "# \"account_type\",\n", - "# \"branch_code\",\n", - "# \"party_customer_number\",\n", - "# \"customer_name\",\n", - "# \"date_of_birth\",\n", - "# \"nationality\",\n", - "# \"business_segment\",\n", - "# \"risk_level\",\n", - "# \"sar_flag\"\n", - "# ]\n", - "# df = pd.DataFrame(row_list, columns = cols)\n", - "# return df" - ] - }, - { - "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "1f20337b-8116-47e5-8743-1ba41e2df819", "metadata": { "tags": [] @@ -194,16 +136,20 @@ "outputs": [], "source": [ "# sen = Scenario()\n", - "# sen.logic()" + "# a = sen.logic()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "6de62b37-00d1-4c88-b27b-9a70e05add91", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], - "source": [] + "source": [ + "# a[a[\"SAR_FLAG\"] == \"Y\"]" + ] } ], "metadata": { diff --git a/main.ipynb b/main.ipynb index 90f3c1f..c5ecb92 100644 --- a/main.ipynb +++ b/main.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0", "metadata": { "tags": [] @@ -89,104 +89,46 @@ " \"Segment\", \"Risk\", \"SAR_FLAG\"]\n", " df = pd.DataFrame(row_list, columns = cols)\n", " df[\"Cash_deposit_total\"] = df[\"Cash_deposit_total\"].astype(float)\n", + " \n", + " \n", + "\n", + " # Step 1: Compute 90th percentiles per Segment for all 3 fields\n", + " percentiles = (\n", + " df.groupby(\"Segment\")[[\"Cash_deposit_total\",\n", + " \"Cash_deposit_count\"]]\n", + " .quantile(0.98)\n", + " .reset_index()\n", + " )\n", + "\n", + " # Rename columns for clarity\n", + " percentiles = percentiles.rename(columns={\n", + " \"Cash_deposit_total\": \"P90_Credit\",\n", + " \"Cash_deposit_count\": \"P90_Credit_count\"\n", + " })\n", + "\n", + " # Step 2: Merge back to main df\n", + " df = df.merge(percentiles, on=\"Segment\", how=\"left\")\n", + "\n", + " # Step 3: Identify customers above 90th percentile in ANY of the 3 metrics\n", + " high_pop = (\n", + " (df[\"Cash_deposit_total\"] > df[\"P90_Credit\"]) &\n", + " (df[\"Cash_deposit_count\"] > df[\"P90_Credit_count\"])\n", + " )\n", + "\n", + " # Step 4: Randomly select 0.1% sample from high-risk population\n", + " sample_fraction = 0.1 # 0.1%\n", + " high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index\n", + "\n", + " # Step 5: Set SAR_FLAG values\n", + " df[\"SAR_FLAG\"] = \"N\" # default for all\n", + " df.loc[high_pop_indices, \"SAR_FLAG\"] = \"Y\" # assign Y to 0.1% random high-risk population\n", + "\n", " return df" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "b6c85de2-6a47-4109-8885-c138c289ec25", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# import pandas as pd\n", - "\n", - "# query = \"\"\"\n", - "# SELECT \n", - "# t.transaction_id,\n", - "# t.transaction_date,\n", - "# t.transaction_amount,\n", - "# t.transaction_desc,\n", - "# t.benef_account_number,\n", - "\n", - "# -- Account data\n", - "# a.account_number,\n", - "# a.customer_number AS acc_customer_number,\n", - "# a.account_type,\n", - "# a.branch_code,\n", - "\n", - "# -- Party data\n", - "# p.customer_number AS party_customer_number,\n", - "# p.customer_name,\n", - "# p.date_of_birth,\n", - "# p.nationality,\n", - "# p.business_segment,\n", - "# CASE\n", - "# WHEN p.risk_classification = 1 THEN 'Low Risk'\n", - "# WHEN p.risk_classification = 2 THEN 'Medium Risk'\n", - "# WHEN p.risk_classification = 3 THEN 'High Risk'\n", - "# ELSE 'Unknown Risk'\n", - "# END AS risk_level,\n", - "\n", - "# -- Alert data\n", - "# COALESCE(al.sar_flag, 'N') AS sar_flag\n", - "\n", - "# FROM {trans_data} t\n", - "\n", - "# -- Join with account data on beneficiary account\n", - "# LEFT JOIN {acc_data} a\n", - "# ON t.benef_account_number = a.account_number\n", - "\n", - "# -- Join with party/customer data using account's customer number\n", - "# LEFT JOIN {cust_data} p\n", - "# ON a.customer_number = p.customer_number\n", - "\n", - "# -- Join with alert data using party's customer number\n", - "# LEFT JOIN {alert_data} al\n", - "# ON p.customer_number = al.customer_number\n", - "\n", - "# WHERE a.account_number IS NOT NULL\n", - "# limit 100\n", - "# \"\"\"\n", - "\n", - "# from tms_data_interface import SQLQueryInterface\n", - "\n", - "# class Scenario:\n", - "# seq = SQLQueryInterface(schema=\"transactionschema\")\n", - "\n", - "# def logic(self, **kwargs):\n", - "# row_list = self.seq.execute_raw(query.format(trans_data=\"transaction10m\",\n", - "# cust_data=\"customer_data_v1\",\n", - "# acc_data=\"account_data_v1\",\n", - "# alert_data=\"alert_data_v1\")\n", - "# )\n", - "# cols = [\n", - "# \"transaction_id\",\n", - "# \"transaction_date\",\n", - "# \"transaction_amount\",\n", - "# \"transaction_desc\",\n", - "# \"benef_account_number\",\n", - "# \"account_number\",\n", - "# \"acc_customer_number\",\n", - "# \"account_type\",\n", - "# \"branch_code\",\n", - "# \"party_customer_number\",\n", - "# \"customer_name\",\n", - "# \"date_of_birth\",\n", - "# \"nationality\",\n", - "# \"business_segment\",\n", - "# \"risk_level\",\n", - "# \"sar_flag\"\n", - "# ]\n", - "# df = pd.DataFrame(row_list, columns = cols)\n", - "# return df" - ] - }, - { - "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "1f20337b-8116-47e5-8743-1ba41e2df819", "metadata": { "tags": [] @@ -194,16 +136,20 @@ "outputs": [], "source": [ "# sen = Scenario()\n", - "# sen.logic()" + "# a = sen.logic()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "6de62b37-00d1-4c88-b27b-9a70e05add91", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], - "source": [] + "source": [ + "# a[a[\"SAR_FLAG\"] == \"Y\"]" + ] } ], "metadata": { diff --git a/main.py b/main.py index 7d20451..7fb7723 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # coding: utf-8 -# In[3]: +# In[7]: import pandas as pd @@ -84,104 +84,52 @@ class Scenario: "Segment", "Risk", "SAR_FLAG"] df = pd.DataFrame(row_list, columns = cols) df["Cash_deposit_total"] = df["Cash_deposit_total"].astype(float) + + + + # Step 1: Compute 90th percentiles per Segment for all 3 fields + percentiles = ( + df.groupby("Segment")[["Cash_deposit_total", + "Cash_deposit_count"]] + .quantile(0.98) + .reset_index() + ) + + # Rename columns for clarity + percentiles = percentiles.rename(columns={ + "Cash_deposit_total": "P90_Credit", + "Cash_deposit_count": "P90_Credit_count" + }) + + # Step 2: Merge back to main df + df = df.merge(percentiles, on="Segment", how="left") + + # Step 3: Identify customers above 90th percentile in ANY of the 3 metrics + high_pop = ( + (df["Cash_deposit_total"] > df["P90_Credit"]) & + (df["Cash_deposit_count"] > df["P90_Credit_count"]) + ) + + # Step 4: Randomly select 0.1% sample from high-risk population + sample_fraction = 0.1 # 0.1% + high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index + + # Step 5: Set SAR_FLAG values + df["SAR_FLAG"] = "N" # default for all + df.loc[high_pop_indices, "SAR_FLAG"] = "Y" # assign Y to 0.1% random high-risk population + return df -# In[6]: - - -# import pandas as pd - -# query = """ -# SELECT -# t.transaction_id, -# t.transaction_date, -# t.transaction_amount, -# t.transaction_desc, -# t.benef_account_number, - -# -- Account data -# a.account_number, -# a.customer_number AS acc_customer_number, -# a.account_type, -# a.branch_code, - -# -- Party data -# p.customer_number AS party_customer_number, -# p.customer_name, -# p.date_of_birth, -# p.nationality, -# p.business_segment, -# CASE -# WHEN p.risk_classification = 1 THEN 'Low Risk' -# WHEN p.risk_classification = 2 THEN 'Medium Risk' -# WHEN p.risk_classification = 3 THEN 'High Risk' -# ELSE 'Unknown Risk' -# END AS risk_level, - -# -- Alert data -# COALESCE(al.sar_flag, 'N') AS sar_flag - -# FROM {trans_data} t - -# -- Join with account data on beneficiary account -# LEFT JOIN {acc_data} a -# ON t.benef_account_number = a.account_number - -# -- Join with party/customer data using account's customer number -# LEFT JOIN {cust_data} p -# ON a.customer_number = p.customer_number - -# -- Join with alert data using party's customer number -# LEFT JOIN {alert_data} al -# ON p.customer_number = al.customer_number - -# WHERE a.account_number IS NOT NULL -# limit 100 -# """ - -# from tms_data_interface import SQLQueryInterface - -# class Scenario: -# seq = SQLQueryInterface(schema="transactionschema") - -# def logic(self, **kwargs): -# row_list = self.seq.execute_raw(query.format(trans_data="transaction10m", -# cust_data="customer_data_v1", -# acc_data="account_data_v1", -# alert_data="alert_data_v1") -# ) -# cols = [ -# "transaction_id", -# "transaction_date", -# "transaction_amount", -# "transaction_desc", -# "benef_account_number", -# "account_number", -# "acc_customer_number", -# "account_type", -# "branch_code", -# "party_customer_number", -# "customer_name", -# "date_of_birth", -# "nationality", -# "business_segment", -# "risk_level", -# "sar_flag" -# ] -# df = pd.DataFrame(row_list, columns = cols) -# return df - - -# In[5]: +# In[8]: # sen = Scenario() -# sen.logic() +# a = sen.logic() -# In[ ]: - +# In[10]: +# a[a["SAR_FLAG"] == "Y"]