diff --git a/.ipynb_checkpoints/main-checkpoint.ipynb b/.ipynb_checkpoints/main-checkpoint.ipynb
index d4445a1..d0ae983 100644
--- a/.ipynb_checkpoints/main-checkpoint.ipynb
+++ b/.ipynb_checkpoints/main-checkpoint.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 4,
"id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0",
"metadata": {
"tags": []
@@ -57,7 +57,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"id": "134d0b3d-5481-4975-af07-c80ab09d6dd2",
"metadata": {
"tags": []
@@ -157,7 +157,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 34,
"id": "d220561a-34c9-48d2-8e2f-5d174a87540b",
"metadata": {
"tags": []
@@ -180,247 +180,79 @@
" \"Debit_transaction_amount\", \"Total_no_of_debit_transactions\",\n",
" \"Wash_Ratio\", \"Segment\", \"Risk\", \"SAR_FLAG\"]\n",
" df = pd.DataFrame(row_list, columns = cols)\n",
+ " \n",
+ " # Step 1: Compute the 0.98 quantile (98th percentile) per Segment for all 3 fields; NOTE(review): the P90_* names suggest the 90th was intended - confirm\n",
+ " percentiles = (\n",
+ " df.groupby(\"Segment\")[[\"Credit_transaction_amount\",\n",
+ " \"Debit_transaction_amount\",\n",
+ " \"Wash_Ratio\"]]\n",
+ " .quantile(0.98)\n",
+ " .reset_index()\n",
+ " )\n",
+ "\n",
+ " # Rename columns for clarity\n",
+ " percentiles = percentiles.rename(columns={\n",
+ " \"Credit_transaction_amount\": \"P90_Credit\",\n",
+ " \"Debit_transaction_amount\": \"P90_Debit\",\n",
+ " \"Wash_Ratio\": \"P90_Wash\"\n",
+ " })\n",
+ "\n",
+ " # Step 2: Merge back to main df\n",
+ " df = df.merge(percentiles, on=\"Segment\", how=\"left\")\n",
+ "\n",
+ " # Step 3: Identify customers above the per-Segment 0.98-quantile threshold (the P90_* columns) in ANY of the 3 metrics\n",
+ " high_pop = (\n",
+ " (df[\"Credit_transaction_amount\"] > df[\"P90_Credit\"]) |\n",
+ " (df[\"Debit_transaction_amount\"] > df[\"P90_Debit\"]) |\n",
+ " (df[\"Wash_Ratio\"] > df[\"P90_Wash\"])\n",
+ " )\n",
+ "\n",
+ " # Step 4: Randomly select a 10% sample (frac=0.1) from the high-risk population (use frac=0.001 if 0.1% is intended)\n",
+ " sample_fraction = 0.1 # passed as sample(frac=...): 0.1 means 10%, not 0.1%\n",
+ " high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index\n",
+ "\n",
+ " # Step 5: Set SAR_FLAG values\n",
+ " df[\"SAR_FLAG\"] = \"N\" # default for all\n",
+ " df.loc[high_pop_indices, \"SAR_FLAG\"] = \"Y\" # assign Y to the randomly sampled 10% (frac=0.1) of the high-risk population\n",
" return df"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 40,
"id": "2e5a0ea9-64cd-4a8d-9a5d-e5e7b36a401a",
"metadata": {
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Focal_id | \n",
- " Credit_transaction_amount | \n",
- " Total_no_of_credit_transactions | \n",
- " Debit_transaction_amount | \n",
- " Total_no_of_debit_transactions | \n",
- " Wash_Ratio | \n",
- " Segment | \n",
- " Risk | \n",
- " SAR_FLAG | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " PN808624 | \n",
- " 4.601504e+09 | \n",
- " 3239 | \n",
- " 4.461280e+09 | \n",
- " 3129 | \n",
- " 1.031431 | \n",
- " Corporate Banking | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " PN663041 | \n",
- " 2.106224e+09 | \n",
- " 1573 | \n",
- " 2.281829e+09 | \n",
- " 1563 | \n",
- " 0.923042 | \n",
- " Corporate Banking | \n",
- " Low Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " PN525913 | \n",
- " 1.057799e+09 | \n",
- " 776 | \n",
- " 1.223876e+09 | \n",
- " 850 | \n",
- " 0.864302 | \n",
- " Whole Sale Banking | \n",
- " Low Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " PN440274 | \n",
- " 4.806265e+09 | \n",
- " 3506 | \n",
- " 4.972813e+09 | \n",
- " 3599 | \n",
- " 0.966508 | \n",
- " Whole Sale Banking | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " PN213026 | \n",
- " 3.982349e+09 | \n",
- " 2809 | \n",
- " 4.122674e+09 | \n",
- " 2783 | \n",
- " 0.965963 | \n",
- " Whole Sale Banking | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 10009 | \n",
- " PN774741 | \n",
- " 3.373466e+07 | \n",
- " 250 | \n",
- " 2.443148e+07 | \n",
- " 381 | \n",
- " 1.380787 | \n",
- " Priority Banking | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 10010 | \n",
- " PN868326 | \n",
- " 3.785344e+07 | \n",
- " 259 | \n",
- " 2.408309e+07 | \n",
- " 352 | \n",
- " 1.571785 | \n",
- " Ultra High NetWorth | \n",
- " Medium Risk | \n",
- " Y | \n",
- "
\n",
- " \n",
- " | 10011 | \n",
- " PN667837 | \n",
- " 3.330357e+07 | \n",
- " 256 | \n",
- " 2.676301e+07 | \n",
- " 359 | \n",
- " 1.244388 | \n",
- " Mass Market | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 10012 | \n",
- " PN809566 | \n",
- " 3.890076e+07 | \n",
- " 276 | \n",
- " 2.554121e+07 | \n",
- " 400 | \n",
- " 1.523059 | \n",
- " Others | \n",
- " Low Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 10013 | \n",
- " PN739647 | \n",
- " 3.505184e+07 | \n",
- " 223 | \n",
- " 2.232980e+07 | \n",
- " 381 | \n",
- " 1.569734 | \n",
- " Others | \n",
- " Low Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- "
\n",
- "
10014 rows × 9 columns
\n",
- "
"
- ],
- "text/plain": [
- " Focal_id Credit_transaction_amount Total_no_of_credit_transactions \\\n",
- "0 PN808624 4.601504e+09 3239 \n",
- "1 PN663041 2.106224e+09 1573 \n",
- "2 PN525913 1.057799e+09 776 \n",
- "3 PN440274 4.806265e+09 3506 \n",
- "4 PN213026 3.982349e+09 2809 \n",
- "... ... ... ... \n",
- "10009 PN774741 3.373466e+07 250 \n",
- "10010 PN868326 3.785344e+07 259 \n",
- "10011 PN667837 3.330357e+07 256 \n",
- "10012 PN809566 3.890076e+07 276 \n",
- "10013 PN739647 3.505184e+07 223 \n",
- "\n",
- " Debit_transaction_amount Total_no_of_debit_transactions Wash_Ratio \\\n",
- "0 4.461280e+09 3129 1.031431 \n",
- "1 2.281829e+09 1563 0.923042 \n",
- "2 1.223876e+09 850 0.864302 \n",
- "3 4.972813e+09 3599 0.966508 \n",
- "4 4.122674e+09 2783 0.965963 \n",
- "... ... ... ... \n",
- "10009 2.443148e+07 381 1.380787 \n",
- "10010 2.408309e+07 352 1.571785 \n",
- "10011 2.676301e+07 359 1.244388 \n",
- "10012 2.554121e+07 400 1.523059 \n",
- "10013 2.232980e+07 381 1.569734 \n",
- "\n",
- " Segment Risk SAR_FLAG \n",
- "0 Corporate Banking Medium Risk N \n",
- "1 Corporate Banking Low Risk N \n",
- "2 Whole Sale Banking Low Risk N \n",
- "3 Whole Sale Banking Medium Risk N \n",
- "4 Whole Sale Banking Medium Risk N \n",
- "... ... ... ... \n",
- "10009 Priority Banking Medium Risk N \n",
- "10010 Ultra High NetWorth Medium Risk Y \n",
- "10011 Mass Market Medium Risk N \n",
- "10012 Others Low Risk N \n",
- "10013 Others Low Risk N \n",
- "\n",
- "[10014 rows x 9 columns]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"# sen = Scenario()\n",
- "# sen.logic()"
+ "# a = sen.logic()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "150bb5ce-6be1-44fc-a606-6d375354626d",
- "metadata": {},
+ "execution_count": 39,
+ "id": "830c7ec3-9707-46db-9b27-ac4f9d46a03a",
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
- "source": []
+ "source": [
+ "# a"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "150bb5ce-6be1-44fc-a606-6d375354626d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# a[a[\"SAR_FLAG\"] == \"Y\"]"
+ ]
}
],
"metadata": {
diff --git a/main.ipynb b/main.ipynb
index d4445a1..d0ae983 100644
--- a/main.ipynb
+++ b/main.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 4,
"id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0",
"metadata": {
"tags": []
@@ -57,7 +57,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"id": "134d0b3d-5481-4975-af07-c80ab09d6dd2",
"metadata": {
"tags": []
@@ -157,7 +157,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 34,
"id": "d220561a-34c9-48d2-8e2f-5d174a87540b",
"metadata": {
"tags": []
@@ -180,247 +180,79 @@
" \"Debit_transaction_amount\", \"Total_no_of_debit_transactions\",\n",
" \"Wash_Ratio\", \"Segment\", \"Risk\", \"SAR_FLAG\"]\n",
" df = pd.DataFrame(row_list, columns = cols)\n",
+ " \n",
+ " # Step 1: Compute the 0.98 quantile (98th percentile) per Segment for all 3 fields; NOTE(review): the P90_* names suggest the 90th was intended - confirm\n",
+ " percentiles = (\n",
+ " df.groupby(\"Segment\")[[\"Credit_transaction_amount\",\n",
+ " \"Debit_transaction_amount\",\n",
+ " \"Wash_Ratio\"]]\n",
+ " .quantile(0.98)\n",
+ " .reset_index()\n",
+ " )\n",
+ "\n",
+ " # Rename columns for clarity\n",
+ " percentiles = percentiles.rename(columns={\n",
+ " \"Credit_transaction_amount\": \"P90_Credit\",\n",
+ " \"Debit_transaction_amount\": \"P90_Debit\",\n",
+ " \"Wash_Ratio\": \"P90_Wash\"\n",
+ " })\n",
+ "\n",
+ " # Step 2: Merge back to main df\n",
+ " df = df.merge(percentiles, on=\"Segment\", how=\"left\")\n",
+ "\n",
+ " # Step 3: Identify customers above the per-Segment 0.98-quantile threshold (the P90_* columns) in ANY of the 3 metrics\n",
+ " high_pop = (\n",
+ " (df[\"Credit_transaction_amount\"] > df[\"P90_Credit\"]) |\n",
+ " (df[\"Debit_transaction_amount\"] > df[\"P90_Debit\"]) |\n",
+ " (df[\"Wash_Ratio\"] > df[\"P90_Wash\"])\n",
+ " )\n",
+ "\n",
+ " # Step 4: Randomly select a 10% sample (frac=0.1) from the high-risk population (use frac=0.001 if 0.1% is intended)\n",
+ " sample_fraction = 0.1 # passed as sample(frac=...): 0.1 means 10%, not 0.1%\n",
+ " high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index\n",
+ "\n",
+ " # Step 5: Set SAR_FLAG values\n",
+ " df[\"SAR_FLAG\"] = \"N\" # default for all\n",
+ " df.loc[high_pop_indices, \"SAR_FLAG\"] = \"Y\" # assign Y to the randomly sampled 10% (frac=0.1) of the high-risk population\n",
" return df"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 40,
"id": "2e5a0ea9-64cd-4a8d-9a5d-e5e7b36a401a",
"metadata": {
"tags": []
},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Focal_id | \n",
- " Credit_transaction_amount | \n",
- " Total_no_of_credit_transactions | \n",
- " Debit_transaction_amount | \n",
- " Total_no_of_debit_transactions | \n",
- " Wash_Ratio | \n",
- " Segment | \n",
- " Risk | \n",
- " SAR_FLAG | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " PN808624 | \n",
- " 4.601504e+09 | \n",
- " 3239 | \n",
- " 4.461280e+09 | \n",
- " 3129 | \n",
- " 1.031431 | \n",
- " Corporate Banking | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " PN663041 | \n",
- " 2.106224e+09 | \n",
- " 1573 | \n",
- " 2.281829e+09 | \n",
- " 1563 | \n",
- " 0.923042 | \n",
- " Corporate Banking | \n",
- " Low Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " PN525913 | \n",
- " 1.057799e+09 | \n",
- " 776 | \n",
- " 1.223876e+09 | \n",
- " 850 | \n",
- " 0.864302 | \n",
- " Whole Sale Banking | \n",
- " Low Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " PN440274 | \n",
- " 4.806265e+09 | \n",
- " 3506 | \n",
- " 4.972813e+09 | \n",
- " 3599 | \n",
- " 0.966508 | \n",
- " Whole Sale Banking | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " PN213026 | \n",
- " 3.982349e+09 | \n",
- " 2809 | \n",
- " 4.122674e+09 | \n",
- " 2783 | \n",
- " 0.965963 | \n",
- " Whole Sale Banking | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 10009 | \n",
- " PN774741 | \n",
- " 3.373466e+07 | \n",
- " 250 | \n",
- " 2.443148e+07 | \n",
- " 381 | \n",
- " 1.380787 | \n",
- " Priority Banking | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 10010 | \n",
- " PN868326 | \n",
- " 3.785344e+07 | \n",
- " 259 | \n",
- " 2.408309e+07 | \n",
- " 352 | \n",
- " 1.571785 | \n",
- " Ultra High NetWorth | \n",
- " Medium Risk | \n",
- " Y | \n",
- "
\n",
- " \n",
- " | 10011 | \n",
- " PN667837 | \n",
- " 3.330357e+07 | \n",
- " 256 | \n",
- " 2.676301e+07 | \n",
- " 359 | \n",
- " 1.244388 | \n",
- " Mass Market | \n",
- " Medium Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 10012 | \n",
- " PN809566 | \n",
- " 3.890076e+07 | \n",
- " 276 | \n",
- " 2.554121e+07 | \n",
- " 400 | \n",
- " 1.523059 | \n",
- " Others | \n",
- " Low Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- " | 10013 | \n",
- " PN739647 | \n",
- " 3.505184e+07 | \n",
- " 223 | \n",
- " 2.232980e+07 | \n",
- " 381 | \n",
- " 1.569734 | \n",
- " Others | \n",
- " Low Risk | \n",
- " N | \n",
- "
\n",
- " \n",
- "
\n",
- "
10014 rows × 9 columns
\n",
- "
"
- ],
- "text/plain": [
- " Focal_id Credit_transaction_amount Total_no_of_credit_transactions \\\n",
- "0 PN808624 4.601504e+09 3239 \n",
- "1 PN663041 2.106224e+09 1573 \n",
- "2 PN525913 1.057799e+09 776 \n",
- "3 PN440274 4.806265e+09 3506 \n",
- "4 PN213026 3.982349e+09 2809 \n",
- "... ... ... ... \n",
- "10009 PN774741 3.373466e+07 250 \n",
- "10010 PN868326 3.785344e+07 259 \n",
- "10011 PN667837 3.330357e+07 256 \n",
- "10012 PN809566 3.890076e+07 276 \n",
- "10013 PN739647 3.505184e+07 223 \n",
- "\n",
- " Debit_transaction_amount Total_no_of_debit_transactions Wash_Ratio \\\n",
- "0 4.461280e+09 3129 1.031431 \n",
- "1 2.281829e+09 1563 0.923042 \n",
- "2 1.223876e+09 850 0.864302 \n",
- "3 4.972813e+09 3599 0.966508 \n",
- "4 4.122674e+09 2783 0.965963 \n",
- "... ... ... ... \n",
- "10009 2.443148e+07 381 1.380787 \n",
- "10010 2.408309e+07 352 1.571785 \n",
- "10011 2.676301e+07 359 1.244388 \n",
- "10012 2.554121e+07 400 1.523059 \n",
- "10013 2.232980e+07 381 1.569734 \n",
- "\n",
- " Segment Risk SAR_FLAG \n",
- "0 Corporate Banking Medium Risk N \n",
- "1 Corporate Banking Low Risk N \n",
- "2 Whole Sale Banking Low Risk N \n",
- "3 Whole Sale Banking Medium Risk N \n",
- "4 Whole Sale Banking Medium Risk N \n",
- "... ... ... ... \n",
- "10009 Priority Banking Medium Risk N \n",
- "10010 Ultra High NetWorth Medium Risk Y \n",
- "10011 Mass Market Medium Risk N \n",
- "10012 Others Low Risk N \n",
- "10013 Others Low Risk N \n",
- "\n",
- "[10014 rows x 9 columns]"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"# sen = Scenario()\n",
- "# sen.logic()"
+ "# a = sen.logic()"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "150bb5ce-6be1-44fc-a606-6d375354626d",
- "metadata": {},
+ "execution_count": 39,
+ "id": "830c7ec3-9707-46db-9b27-ac4f9d46a03a",
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
- "source": []
+ "source": [
+ "# a"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "id": "150bb5ce-6be1-44fc-a606-6d375354626d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# a[a[\"SAR_FLAG\"] == \"Y\"]"
+ ]
}
],
"metadata": {
diff --git a/main.py b/main.py
index 2f5e9af..98e9fe8 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# coding: utf-8
-# In[1]:
+# In[4]:
import pandas as pd
@@ -20,7 +20,7 @@ seq = SQLQueryInterface(schema="transactionschema")
seq.execute_raw("show tables")
-# In[7]:
+# In[6]:
query = """
@@ -114,7 +114,7 @@ query = """
"""
-# In[8]:
+# In[34]:
from tms_data_interface import SQLQueryInterface
@@ -133,18 +133,58 @@ class Scenario:
"Debit_transaction_amount", "Total_no_of_debit_transactions",
"Wash_Ratio", "Segment", "Risk", "SAR_FLAG"]
df = pd.DataFrame(row_list, columns = cols)
+
+ # Step 1: Compute the 0.98 quantile (98th percentile) per Segment for all 3 fields; NOTE(review): the P90_* names suggest the 90th was intended - confirm
+ percentiles = (
+ df.groupby("Segment")[["Credit_transaction_amount",
+ "Debit_transaction_amount",
+ "Wash_Ratio"]]
+ .quantile(0.98)
+ .reset_index()
+ )
+
+ # Rename columns for clarity
+ percentiles = percentiles.rename(columns={
+ "Credit_transaction_amount": "P90_Credit",
+ "Debit_transaction_amount": "P90_Debit",
+ "Wash_Ratio": "P90_Wash"
+ })
+
+ # Step 2: Merge back to main df
+ df = df.merge(percentiles, on="Segment", how="left")
+
+ # Step 3: Identify customers above the per-Segment 0.98-quantile threshold (the P90_* columns) in ANY of the 3 metrics
+ high_pop = (
+ (df["Credit_transaction_amount"] > df["P90_Credit"]) |
+ (df["Debit_transaction_amount"] > df["P90_Debit"]) |
+ (df["Wash_Ratio"] > df["P90_Wash"])
+ )
+
+ # Step 4: Randomly select a 10% sample (frac=0.1) from the high-risk population (use frac=0.001 if 0.1% is intended)
+ sample_fraction = 0.1 # passed as sample(frac=...): 0.1 means 10%, not 0.1%
+ high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index
+
+ # Step 5: Set SAR_FLAG values
+ df["SAR_FLAG"] = "N" # default for all
+ df.loc[high_pop_indices, "SAR_FLAG"] = "Y" # assign Y to the randomly sampled 10% (frac=0.1) of the high-risk population
return df
-# In[9]:
+# In[40]:
# sen = Scenario()
-# sen.logic()
+# a = sen.logic()
-# In[ ]:
+# In[39]:
+# a
+# In[38]:
+
+
+# a[a["SAR_FLAG"] == "Y"]
+