Initial commit

2026-06-16 16:39:19 +00:00 · 2026-06-16 16:39:19 +00:00 · 35c02db444
commit 35c02db444
3 changed files with 487 additions and 0 deletions
--- a/.ipynb_checkpoints/main-checkpoint.ipynb
+++ b/.ipynb_checkpoints/main-checkpoint.ipynb
@ -0,0 +1,176 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "query = \"\"\"\n",
+    "    select  final.CUSTOMER_NUMBER_main as Focal_id,\n",
+    "        CAST(final.Cash_deposit_total AS DECIMAL(18, 2)) AS Cash_deposit_total,\n",
+    "        final.Cash_deposit_count,\n",
+    "        final.SEGMENT,\n",
+    "        final.RISK,\n",
+    "        final.SAR_FLAG\n",
+    "from \n",
+    "(\n",
+    "    (\n",
+    "        select subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main,\n",
+    "               subquery.Cash_deposit_total,\n",
+    "               subquery.Cash_deposit_count\n",
+    "        from \n",
+    "        (\n",
+    "            select customer_number as CUSTOMER_NUMBER_1, \n",
+    "                   sum(transaction_amount) as Cash_deposit_total, \n",
+    "                   count(*) as Cash_deposit_count\n",
+    "            from \n",
+    "            (\n",
+    "                select * \n",
+    "                from {trans_data} trans_table \n",
+    "                left join {acc_data} acc_table\n",
+    "                on trans_table.benef_account_number = acc_table.account_number\n",
+    "            ) trans\n",
+    "            where account_number not in ('None')\n",
+    "            and transaction_desc = 'CASH RELATED TRANSACTION'\n",
+    "            group by customer_number\n",
+    "        ) subquery\n",
+    "    ) main \n",
+    "    left join \n",
+    "   (\n",
+    "        select cd.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust,\n",
+    "               cd.SEGMENT,\n",
+    "               cd.RISK,\n",
+    "               case\n",
+    "                    when ad.SAR_FLAG is NULL then 'N'\n",
+    "                    else ad.SAR_FLAG\n",
+    "               end as SAR_FLAG    \n",
+    "        from\n",
+    "        (\n",
+    "            select customer_number as CUSTOMER_NUMBER_3, \n",
+    "                   business_segment as SEGMENT,\n",
+    "                   case\n",
+    "                       when RISK_CLASSIFICATION = 1 then 'Low Risk'\n",
+    "                       when RISK_CLASSIFICATION = 2 then 'Medium Risk'\n",
+    "                       when RISK_CLASSIFICATION = 3 then 'High Risk'\n",
+    "                       else 'Unknown Risk'\n",
+    "                   end AS RISK\n",
+    "            from {cust_data}\n",
+    "        ) cd \n",
+    "        left join\n",
+    "        (\n",
+    "            select customer_number as CUSTOMER_NUMBER_4, \n",
+    "                   sar_flag as SAR_FLAG\n",
+    "            from {alert_data}\n",
+    "        ) ad \n",
+    "        on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4\n",
+    "    ) as cust_alert\n",
+    "    on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main\n",
+    ") as final\n",
+    "\"\"\"\n",
+    "\n",
+    "from tms_data_interface import SQLQueryInterface\n",
+    "\n",
+    "class Scenario:\n",
+    "    seq = SQLQueryInterface(schema=\"transactionschema\")\n",
+    "\n",
+    "    def logic(self, **kwargs):\n",
+    "        row_list = self.seq.execute_raw(query.format(trans_data=\"transaction10m\",\n",
+    "                                                    cust_data=\"customer_data_v1\",\n",
+    "                                                    acc_data=\"account_data_v1\",\n",
+    "                                                    alert_data=\"alert_data_v1\")\n",
+    "                                       )\n",
+    "        cols = [\"Focal_id\", \"Cash_deposit_total\", \"Cash_deposit_count\",\n",
+    "                \"Segment\", \"Risk\", \"SAR_FLAG\"]\n",
+    "        df = pd.DataFrame(row_list, columns = cols)\n",
+    "        df[\"Cash_deposit_total\"] = df[\"Cash_deposit_total\"].astype(float)\n",
+    "        \n",
+    "        \n",
+    "\n",
+    "        # Step 1: Compute the 98th percentile (0.98 quantile) per Segment for both deposit metrics\n",
+    "        percentiles = (\n",
+    "            df.groupby(\"Segment\")[[\"Cash_deposit_total\",\n",
+    "                                   \"Cash_deposit_count\"]]\n",
+    "              .quantile(0.98)\n",
+    "              .reset_index()\n",
+    "        )\n",
+    "\n",
+    "        # Rename the threshold columns (names keep the P90_ prefix although the values are 98th percentiles)\n",
+    "        percentiles = percentiles.rename(columns={\n",
+    "            \"Cash_deposit_total\": \"P90_Credit\",\n",
+    "            \"Cash_deposit_count\": \"P90_Credit_count\"\n",
+    "        })\n",
+    "\n",
+    "        # Step 2: Merge back to main df\n",
+    "        df = df.merge(percentiles, on=\"Segment\", how=\"left\")\n",
+    "\n",
+    "        # Step 3: Identify customers above the per-Segment threshold in BOTH metrics (total AND count)\n",
+    "        high_pop = (\n",
+    "            (df[\"Cash_deposit_total\"] > df[\"P90_Credit\"]) &\n",
+    "            (df[\"Cash_deposit_count\"] > df[\"P90_Credit_count\"])\n",
+    "        )\n",
+    "\n",
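+    "        # Step 4: Randomly select a 10% sample from the above-threshold population\n",
+    "        sample_fraction = 0.1   # fraction passed to DataFrame.sample: 0.1 == 10% (0.1% would be 0.001) - NOTE(review): confirm intent\n",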
+    "        high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index\n",
+    "\n",
+    "        # Step 5: Set SAR_FLAG values\n",
+    "        df[\"SAR_FLAG\"] = \"N\"   # default for all\n",
+    "        df.loc[high_pop_indices, \"SAR_FLAG\"] = \"Y\"   # assign Y to the randomly sampled 10% of the above-threshold population\n",
+    "\n",
+    "        return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "1f20337b-8116-47e5-8743-1ba41e2df819",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# sen = Scenario()\n",
+    "# a = sen.logic()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "6de62b37-00d1-4c88-b27b-9a70e05add91",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# a[a[\"SAR_FLAG\"] == \"Y\"]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/main.ipynb
+++ b/main.ipynb
@ -0,0 +1,176 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "query = \"\"\"\n",
+    "    select  final.CUSTOMER_NUMBER_main as Focal_id,\n",
+    "        CAST(final.Cash_deposit_total AS DECIMAL(18, 2)) AS Cash_deposit_total,\n",
+    "        final.Cash_deposit_count,\n",
+    "        final.SEGMENT,\n",
+    "        final.RISK,\n",
+    "        final.SAR_FLAG\n",
+    "from \n",
+    "(\n",
+    "    (\n",
+    "        select subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main,\n",
+    "               subquery.Cash_deposit_total,\n",
+    "               subquery.Cash_deposit_count\n",
+    "        from \n",
+    "        (\n",
+    "            select customer_number as CUSTOMER_NUMBER_1, \n",
+    "                   sum(transaction_amount) as Cash_deposit_total, \n",
+    "                   count(*) as Cash_deposit_count\n",
+    "            from \n",
+    "            (\n",
+    "                select * \n",
+    "                from {trans_data} trans_table \n",
+    "                left join {acc_data} acc_table\n",
+    "                on trans_table.benef_account_number = acc_table.account_number\n",
+    "            ) trans\n",
+    "            where account_number not in ('None')\n",
+    "            and transaction_desc = 'CASH RELATED TRANSACTION'\n",
+    "            group by customer_number\n",
+    "        ) subquery\n",
+    "    ) main \n",
+    "    left join \n",
+    "   (\n",
+    "        select cd.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust,\n",
+    "               cd.SEGMENT,\n",
+    "               cd.RISK,\n",
+    "               case\n",
+    "                    when ad.SAR_FLAG is NULL then 'N'\n",
+    "                    else ad.SAR_FLAG\n",
+    "               end as SAR_FLAG    \n",
+    "        from\n",
+    "        (\n",
+    "            select customer_number as CUSTOMER_NUMBER_3, \n",
+    "                   business_segment as SEGMENT,\n",
+    "                   case\n",
+    "                       when RISK_CLASSIFICATION = 1 then 'Low Risk'\n",
+    "                       when RISK_CLASSIFICATION = 2 then 'Medium Risk'\n",
+    "                       when RISK_CLASSIFICATION = 3 then 'High Risk'\n",
+    "                       else 'Unknown Risk'\n",
+    "                   end AS RISK\n",
+    "            from {cust_data}\n",
+    "        ) cd \n",
+    "        left join\n",
+    "        (\n",
+    "            select customer_number as CUSTOMER_NUMBER_4, \n",
+    "                   sar_flag as SAR_FLAG\n",
+    "            from {alert_data}\n",
+    "        ) ad \n",
+    "        on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4\n",
+    "    ) as cust_alert\n",
+    "    on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main\n",
+    ") as final\n",
+    "\"\"\"\n",
+    "\n",
+    "from tms_data_interface import SQLQueryInterface\n",
+    "\n",
+    "class Scenario:\n",
+    "    seq = SQLQueryInterface(schema=\"transactionschema\")\n",
+    "\n",
+    "    def logic(self, **kwargs):\n",
+    "        row_list = self.seq.execute_raw(query.format(trans_data=\"transaction10m\",\n",
+    "                                                    cust_data=\"customer_data_v1\",\n",
+    "                                                    acc_data=\"account_data_v1\",\n",
+    "                                                    alert_data=\"alert_data_v1\")\n",
+    "                                       )\n",
+    "        cols = [\"Focal_id\", \"Cash_deposit_total\", \"Cash_deposit_count\",\n",
+    "                \"Segment\", \"Risk\", \"SAR_FLAG\"]\n",
+    "        df = pd.DataFrame(row_list, columns = cols)\n",
+    "        df[\"Cash_deposit_total\"] = df[\"Cash_deposit_total\"].astype(float)\n",
+    "        \n",
+    "        \n",
+    "\n",
+    "        # Step 1: Compute the 98th percentile (0.98 quantile) per Segment for both deposit metrics\n",
+    "        percentiles = (\n",
+    "            df.groupby(\"Segment\")[[\"Cash_deposit_total\",\n",
+    "                                   \"Cash_deposit_count\"]]\n",
+    "              .quantile(0.98)\n",
+    "              .reset_index()\n",
+    "        )\n",
+    "\n",
+    "        # Rename the threshold columns (names keep the P90_ prefix although the values are 98th percentiles)\n",
+    "        percentiles = percentiles.rename(columns={\n",
+    "            \"Cash_deposit_total\": \"P90_Credit\",\n",
+    "            \"Cash_deposit_count\": \"P90_Credit_count\"\n",
+    "        })\n",
+    "\n",
+    "        # Step 2: Merge back to main df\n",
+    "        df = df.merge(percentiles, on=\"Segment\", how=\"left\")\n",
+    "\n",
+    "        # Step 3: Identify customers above the per-Segment threshold in BOTH metrics (total AND count)\n",
+    "        high_pop = (\n",
+    "            (df[\"Cash_deposit_total\"] > df[\"P90_Credit\"]) &\n",
+    "            (df[\"Cash_deposit_count\"] > df[\"P90_Credit_count\"])\n",
+    "        )\n",
+    "\n",
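+    "        # Step 4: Randomly select a 10% sample from the above-threshold population\n",
+    "        sample_fraction = 0.1   # fraction passed to DataFrame.sample: 0.1 == 10% (0.1% would be 0.001) - NOTE(review): confirm intent\n",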
+    "        high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index\n",
+    "\n",
+    "        # Step 5: Set SAR_FLAG values\n",
+    "        df[\"SAR_FLAG\"] = \"N\"   # default for all\n",
+    "        df.loc[high_pop_indices, \"SAR_FLAG\"] = \"Y\"   # assign Y to the randomly sampled 10% of the above-threshold population\n",
+    "\n",
+    "        return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "1f20337b-8116-47e5-8743-1ba41e2df819",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# sen = Scenario()\n",
+    "# a = sen.logic()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "6de62b37-00d1-4c88-b27b-9a70e05add91",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# a[a[\"SAR_FLAG\"] == \"Y\"]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/main.py
+++ b/main.py
@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[7]:
+
+
+import pandas as pd
+# SQL template: per-customer cash-deposit sum/count, left-joined to segment, risk label and SAR flag; the {trans_data}, {acc_data}, {cust_data}, {alert_data} placeholders are filled with str.format in Scenario.logic.
+query = """
+    select  final.CUSTOMER_NUMBER_main as Focal_id,
+        CAST(final.Cash_deposit_total AS DECIMAL(18, 2)) AS Cash_deposit_total,
+        final.Cash_deposit_count,
+        final.SEGMENT,
+        final.RISK,
+        final.SAR_FLAG
+from 
+(
+    (
+        select subquery.CUSTOMER_NUMBER_1 as CUSTOMER_NUMBER_main,
+               subquery.Cash_deposit_total,
+               subquery.Cash_deposit_count
+        from 
+        (
+            select customer_number as CUSTOMER_NUMBER_1, 
+                   sum(transaction_amount) as Cash_deposit_total, 
+                   count(*) as Cash_deposit_count
+            from 
+            (
+                select * 
+                from {trans_data} trans_table 
+                left join {acc_data} acc_table
+                on trans_table.benef_account_number = acc_table.account_number
+            ) trans
+            where account_number not in ('None')
+            and transaction_desc = 'CASH RELATED TRANSACTION'
+            group by customer_number
+        ) subquery
+    ) main 
+    left join 
+   (
+        select cd.CUSTOMER_NUMBER_3 as CUSTOMER_NUMBER_cust,
+               cd.SEGMENT,
+               cd.RISK,
+               case
+                    when ad.SAR_FLAG is NULL then 'N'
+                    else ad.SAR_FLAG
+               end as SAR_FLAG    
+        from
+        (
+            select customer_number as CUSTOMER_NUMBER_3, 
+                   business_segment as SEGMENT,
+                   case
+                       when RISK_CLASSIFICATION = 1 then 'Low Risk'
+                       when RISK_CLASSIFICATION = 2 then 'Medium Risk'
+                       when RISK_CLASSIFICATION = 3 then 'High Risk'
+                       else 'Unknown Risk'
+                   end AS RISK
+            from {cust_data}
+        ) cd 
+        left join
+        (
+            select customer_number as CUSTOMER_NUMBER_4, 
+                   sar_flag as SAR_FLAG
+            from {alert_data}
+        ) ad 
+        on cd.CUSTOMER_NUMBER_3 = ad.CUSTOMER_NUMBER_4
+    ) as cust_alert
+    on cust_alert.CUSTOMER_NUMBER_cust = main.CUSTOMER_NUMBER_main
+) as final
+"""
+
+from tms_data_interface import SQLQueryInterface
+
+class Scenario:  # Cash-deposit scenario: runs the module-level SQL template and re-assigns SAR_FLAG in pandas.
+    seq = SQLQueryInterface(schema="transactionschema")  # class attribute, built once when the class body executes
+    # logic(): run the query, compute per-Segment thresholds, return the frame with SAR_FLAG re-assigned; **kwargs is never read.
+    def logic(self, **kwargs):
+        row_list = self.seq.execute_raw(query.format(trans_data="transaction10m",  # fill the table-name placeholders
+                                                    cust_data="customer_data_v1",
+                                                    acc_data="account_data_v1",
+                                                    alert_data="alert_data_v1")
+                                       )
+        cols = ["Focal_id", "Cash_deposit_total", "Cash_deposit_count",  # labels applied positionally to the six selected columns
+                "Segment", "Risk", "SAR_FLAG"]
+        df = pd.DataFrame(row_list, columns = cols)  # assumes execute_raw returns row sequences in select order - TODO confirm
+        df["Cash_deposit_total"] = df["Cash_deposit_total"].astype(float)  # the query CASTs this to DECIMAL(18, 2); converted to float before the quantile step
+        # NOTE(review): the "P90_" column names below are part of the returned frame and are left unchanged,
+        # but the thresholds they hold are the 0.98 quantile (98th percentile), not the 90th.
+
+        # Step 1: Compute the 98th percentile (0.98 quantile) per Segment for both deposit metrics
+        percentiles = (
+            df.groupby("Segment")[["Cash_deposit_total",
+                                   "Cash_deposit_count"]]
+              .quantile(0.98)
+              .reset_index()
+        )
+
+        # Rename the threshold columns (names keep the P90_ prefix although the values are 98th percentiles)
+        percentiles = percentiles.rename(columns={
+            "Cash_deposit_total": "P90_Credit",
+            "Cash_deposit_count": "P90_Credit_count"
+        })
+
+        # Step 2: Merge the per-Segment thresholds back onto every row (left merge on Segment)
+        df = df.merge(percentiles, on="Segment", how="left")
+
+        # Step 3: Identify customers strictly above the per-Segment threshold in BOTH metrics (total AND count)
+        high_pop = (
+            (df["Cash_deposit_total"] > df["P90_Credit"]) &
+            (df["Cash_deposit_count"] > df["P90_Credit_count"])
+        )
+
+        # Step 4: Randomly select a 10% sample from the above-threshold population (seeded with random_state=42)
+        sample_fraction = 0.1   # fraction passed to DataFrame.sample: 0.1 == 10% (0.1% would be 0.001) - NOTE(review): confirm intent
+        high_pop_indices = df[high_pop].sample(frac=sample_fraction, random_state=42).index
+
+        # Step 5: Set SAR_FLAG values
+        df["SAR_FLAG"] = "N"   # default for all; this overwrites the SAR_FLAG values returned by the query
+        df.loc[high_pop_indices, "SAR_FLAG"] = "Y"   # assign Y to the randomly sampled 10% of the above-threshold population
+
+        return df  # carries the six query columns plus P90_Credit and P90_Credit_count
+
+
+# In[8]:
+
+
+# sen = Scenario()
+# a = sen.logic()
+
+
+# In[10]:
+
+
+# a[a["SAR_FLAG"] == "Y"]
+