def apply_sar_flag(df, var1, var2, var3, random_state=42):
    """
    Split *df* into alerting and non-alerting populations, flag a random 10%
    of the alerting rows as 'Y', and return the recombined frame.

    A row alerts when ALL three variables are >= their respective 90th
    percentiles (each threshold computed over that column's non-null values).

    Parameters:
        df (pd.DataFrame): Input dataframe.
        var1 (str): First variable (90th percentile threshold).
        var2 (str): Second variable (90th percentile threshold).
        var3 (str): Third variable (90th percentile threshold).
        random_state (int): Seed so the 10% 'Y' sample is reproducible.

    Returns:
        pd.DataFrame: Copy of *df*, original row order preserved, with a
        'SAR_FLAG' column ('Y'/'N') added.
    """

    # Thresholds over non-null values only; np.percentile propagates NaN
    # otherwise. Note: the code uses the 90th percentile for ALL three
    # variables (earlier docs claiming 50th for var1/var2 were wrong).
    th1 = np.percentile(df[var1].dropna(), 90)
    th2 = np.percentile(df[var2].dropna(), 90)
    th3 = np.percentile(df[var3].dropna(), 90)

    # A row alerts only when every variable clears its threshold.
    # NaN comparisons are False, so rows with missing values never alert.
    alerting = df[(df[var1] >= th1) &
                  (df[var2] >= th2) &
                  (df[var3] >= th3)].copy()

    # Index-based complement (not a mask negation) so that under duplicate
    # index labels every row sharing an alerting label is excluded here,
    # matching the original behavior.
    non_alerting = df.loc[~df.index.isin(alerting.index)].copy()

    # Default everyone to 'N' ...
    non_alerting['SAR_FLAG'] = 'N'
    alerting['SAR_FLAG'] = 'N'

    # ... then promote a reproducible floor(10%) of the alerting rows to 'Y'.
    n_y = int(len(alerting) * 0.1)
    if n_y > 0:
        y_indices = alerting.sample(n=n_y, random_state=random_state).index
        alerting.loc[y_indices, 'SAR_FLAG'] = 'Y'

    # Reassemble in the original row order.
    final_df = pd.concat([alerting, non_alerting]).sort_index()

    return final_df
price change percentage\n", + " (max_price - min_price) / NULLIF(min_price, 0) * 100 AS \"Price Change (%)\",\n", + "\n", + " -- Calculate the participant's volume as a percentage of total volume\n", + " (participant_volume / NULLIF(total_volume, 0)) * 100 AS \"Volume (%)\",\n", + "\n", + " -- Participant volume\n", + " participant_volume,\n", + "\n", + " -- Select the total volume within the window\n", + " total_volume AS \"Total Volume\"\n", + "FROM\n", + " time_windows\n", + "\"\"\"\n", + "\n", + "\n", + "from tms_data_interface import SQLQueryInterface\n", + "\n", + "class Scenario:\n", + " seq = SQLQueryInterface(schema=\"trade_schema\")\n", + " def logic(self, **kwargs):\n", + " validation_window = kwargs.get('validation_window', 300000)\n", + " time_window_s = int(validation_window/1000)\n", + " query_start_time = datetime.now()\n", + " print(\"Query start time :\",query_start_time)\n", + " row_list = self.seq.execute_raw(query.format(trade_data_1b=\"trade_10m_v3\",\n", + " time_window_s = time_window_s)\n", + " )\n", + " cols = [\n", + " 'START_DATE_TIME',\n", + " 'END_DATE_TIME',\n", + " 'Focal_id',\n", + " 'MIN_PRICE',\n", + " 'MAX_PRICE',\n", + " 'PRICE_CHANGE_PCT',\n", + " 'PARTICIPANT_VOLUME_PCT',\n", + " 'PARTICIPANT_VOLUME',\n", + " 'TOTAL_VOLUME',\n", + " ]\n", + " final_scenario_df = pd.DataFrame(row_list, columns = cols)\n", + " final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\\\n", + " final_scenario_df['TOTAL_VOLUME'] * 100\n", + " final_scenario_df['Segment'] = 'Default'\n", + " # final_scenario_df['SAR_FLAG'] = 'N'\n", + " final_scenario_df['Risk'] = 'Medium Risk'\n", + " final_scenario_df.dropna(inplace=True)\n", + " final_scenario_df = apply_sar_flag(final_scenario_df,\n", + " 'PRICE_CHANGE_PCT',\n", + " 'PARTICIPANT_VOLUME_PCT',\n", + " 'TOTAL_VOLUME',\n", + " random_state=42)\n", + " # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']\n", + " return final_scenario_df\n" + ] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/main.ipynb b/main.ipynb index 3277afb..f6ac3b2 100644 --- a/main.ipynb +++ b/main.ipynb @@ -1,33 +1,188 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "e706cfb0-2234-4c4c-95d8-d1968f656aa0", - "metadata": {}, - "outputs": [], - "source": "from tms_data_interface import SQLQueryInterface\n\nclass Scenario:\n\tseq = SQLQueryInterface()\n\n\tdef logic(self, **kwargs):\n\t\t# Write your code here\n" - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "d3b830ff-94b7-4596-b85a-320060180a6b", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime, timedelta\n", + "import pandas as pd\n", + "import numpy as np\n", + "from tms_data_interface import SQLQueryInterface\n", + "\n", + "def apply_sar_flag(df, var1, var2, var3, random_state=42):\n", + " \"\"\"\n", + " Apply percentile-based thresholds, split data into alerting and non-alerting,\n", + " flag random 10% of alerting data as 'Y', and merge back.\n", + "\n", + " Parameters:\n", + " df (pd.DataFrame): Input dataframe\n", + " var1 (str): First 
# SQL template for the price-movement / participant-volume scenario.
# Filled via str.format with:
#   {time_window_s}  - look-back window length in whole seconds
#   {trade_data_1b}  - source trade table name
# NOTE(review): `CASE WHEN trader_id = trader_id` below is always true for
# non-NULL trader ids, so combined with PARTITION BY trader_id it reduces to
# SUM(trade_volume). Presumably an outer/self-join reference was intended -
# confirm against the scenario specification.
query = """
WITH time_windows AS (
    SELECT
        -- End time is the current trade time
        date_time AS end_time,

        -- Subtract seconds from the end_time using date_add() with negative integer interval
        date_add('second', -{time_window_s}, date_time) AS start_time,

        -- Trade details
        trade_price,
        trade_volume,
        trader_id,

        -- Calculate minimum price within the time window
        MIN(trade_price) OVER (
            ORDER BY date_time
            RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW
        ) AS min_price,

        -- Calculate maximum price within the time window
        MAX(trade_price) OVER (
            ORDER BY date_time
            RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW
        ) AS max_price,

        -- Calculate total trade volume within the time window
        SUM(trade_volume) OVER (
            ORDER BY date_time
            RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW
        ) AS total_volume,

        -- Calculate participant's trade volume within the time window
        SUM(CASE WHEN trader_id = trader_id THEN trade_volume ELSE 0 END) OVER (
            PARTITION BY trader_id
            ORDER BY date_time
            RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW
        ) AS participant_volume
    FROM
        {trade_data_1b}
)
SELECT
    -- Select the time window details
    start_time,
    end_time,

    -- Select the participant (trader) ID
    trader_id AS "Participant",

    -- Select the calculated min and max prices
    min_price,
    max_price,

    -- Calculate the price change percentage
    (max_price - min_price) / NULLIF(min_price, 0) * 100 AS "Price Change (%)",

    -- Calculate the participant's volume as a percentage of total volume
    (participant_volume / NULLIF(total_volume, 0)) * 100 AS "Volume (%)",

    -- Participant volume
    participant_volume,

    -- Select the total volume within the window
    total_volume AS "Total Volume"
FROM
    time_windows
"""
time :\",query_start_time)\n", + " row_list = self.seq.execute_raw(query.format(trade_data_1b=\"trade_10m_v3\",\n", + " time_window_s = time_window_s)\n", + " )\n", + " cols = [\n", + " 'START_DATE_TIME',\n", + " 'END_DATE_TIME',\n", + " 'Focal_id',\n", + " 'MIN_PRICE',\n", + " 'MAX_PRICE',\n", + " 'PRICE_CHANGE_PCT',\n", + " 'PARTICIPANT_VOLUME_PCT',\n", + " 'PARTICIPANT_VOLUME',\n", + " 'TOTAL_VOLUME',\n", + " ]\n", + " final_scenario_df = pd.DataFrame(row_list, columns = cols)\n", + " final_scenario_df['PARTICIPANT_VOLUME_PCT'] = final_scenario_df['PARTICIPANT_VOLUME']/\\\n", + " final_scenario_df['TOTAL_VOLUME'] * 100\n", + " final_scenario_df['Segment'] = 'Default'\n", + " # final_scenario_df['SAR_FLAG'] = 'N'\n", + " final_scenario_df['Risk'] = 'Medium Risk'\n", + " final_scenario_df.dropna(inplace=True)\n", + " final_scenario_df = apply_sar_flag(final_scenario_df,\n", + " 'PRICE_CHANGE_PCT',\n", + " 'PARTICIPANT_VOLUME_PCT',\n", + " 'TOTAL_VOLUME',\n", + " random_state=42)\n", + " # final_scenario_df['RUN_DATE'] = final_scenario_df['END_DATE']\n", + " return final_scenario_df\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/main.py b/main.py new file mode 100644 index 0000000..7d38ee2 --- /dev/null +++ b/main.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python +# coding: utf-8 + +# In[ ]: + + +from datetime import datetime, timedelta +import pandas as pd +import numpy as np +from tms_data_interface import SQLQueryInterface + +def apply_sar_flag(df, var1, var2, var3, random_state=42): + """ + Apply percentile-based thresholds, split data into alerting and 
non-alerting, + flag random 10% of alerting data as 'Y', and merge back. + + Parameters: + df (pd.DataFrame): Input dataframe + var1 (str): First variable (for 50th percentile threshold) + var2 (str): Second variable (for 50th percentile threshold) + var3 (str): Third variable (for 90th percentile threshold) + random_state (int): Seed for reproducibility + + Returns: + pd.DataFrame: DataFrame with 'SAR_Flag' column added + """ + + # Calculate thresholds + th1 = np.percentile(df[var1].dropna(), 90) + th2 = np.percentile(df[var2].dropna(), 90) + th3 = np.percentile(df[var3].dropna(), 90) + + # Split into alerting and non-alerting + alerting = df[(df[var1] >= th1) & + (df[var2] >= th2) & + (df[var3] >= th3)].copy() + + non_alerting = df.loc[~df.index.isin(alerting.index)].copy() + + # Assign SAR_Flag = 'N' for non-alerting + non_alerting['SAR_FLAG'] = 'N' + + # Assign SAR_Flag for alerting data + alerting['SAR_FLAG'] = 'N' + n_y = int(len(alerting) * 0.1) # 10% count + if n_y > 0: + y_indices = alerting.sample(n=n_y, random_state=random_state).index + alerting.loc[y_indices, 'SAR_FLAG'] = 'Y' + + # Merge back and preserve original order + final_df = pd.concat([alerting, non_alerting]).sort_index() + + return final_df + +query = """ +WITH time_windows AS ( + SELECT + -- End time is the current trade time + date_time AS end_time, + + -- Subtract seconds from the end_time using date_add() with negative integer interval + date_add('second', -{time_window_s}, date_time) AS start_time, + + -- Trade details + trade_price, + trade_volume, + trader_id, + + -- Calculate minimum price within the time window + MIN(trade_price) OVER ( + ORDER BY date_time + RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW + ) AS min_price, + + -- Calculate maximum price within the time window + MAX(trade_price) OVER ( + ORDER BY date_time + RANGE BETWEEN INTERVAL '{time_window_s}' SECOND PRECEDING AND CURRENT ROW + ) AS max_price, + + -- Calculate total trade volume within 
class Scenario:
    """Price-movement / volume-concentration trade-surveillance scenario.

    Runs the module-level ``query`` against the trade table, derives
    per-participant volume percentages, and labels suspicious rows via
    ``apply_sar_flag``.
    """

    # Shared query interface bound to the trading schema (class-level, so one
    # connection object serves all Scenario instances).
    seq = SQLQueryInterface(schema="trade_schema")

    def logic(self, **kwargs):
        """Execute the scenario and return the alert dataframe.

        Keyword Args:
            validation_window (int): Look-back window in MILLISECONDS
                (default 300000 = 5 minutes); converted to whole seconds
                for the SQL window frame.

        Returns:
            pd.DataFrame: One row per (window, participant) with price/volume
            metrics, 'Segment', 'Risk', and a 'SAR_FLAG' column.
        """
        validation_window = kwargs.get('validation_window', 300000)
        time_window_s = int(validation_window / 1000)  # ms -> s for the SQL frame

        query_start_time = datetime.now()
        print("Query start time :", query_start_time)
        row_list = self.seq.execute_raw(
            query.format(trade_data_1b="trade_10m_v3",
                         time_window_s=time_window_s)
        )
        # Report how long the query took (start time alone was not useful).
        print("Query elapsed :", datetime.now() - query_start_time)

        cols = [
            'START_DATE_TIME',
            'END_DATE_TIME',
            'Focal_id',
            'MIN_PRICE',
            'MAX_PRICE',
            'PRICE_CHANGE_PCT',
            'PARTICIPANT_VOLUME_PCT',
            'PARTICIPANT_VOLUME',
            'TOTAL_VOLUME',
        ]
        final_scenario_df = pd.DataFrame(row_list, columns=cols)

        # Recompute the percentage in pandas. Guard TOTAL_VOLUME == 0: the
        # division yields +/-inf, which dropna() would NOT remove and which
        # would poison apply_sar_flag's percentile thresholds - map inf to
        # NaN so those rows are dropped below.
        final_scenario_df['PARTICIPANT_VOLUME_PCT'] = (
            final_scenario_df['PARTICIPANT_VOLUME']
            / final_scenario_df['TOTAL_VOLUME'] * 100
        ).replace([np.inf, -np.inf], np.nan)

        final_scenario_df['Segment'] = 'Default'
        final_scenario_df['Risk'] = 'Medium Risk'
        final_scenario_df.dropna(inplace=True)

        # SAR_FLAG assignment: 90th-percentile thresholds + random 10% 'Y'.
        final_scenario_df = apply_sar_flag(final_scenario_df,
                                           'PRICE_CHANGE_PCT',
                                           'PARTICIPANT_VOLUME_PCT',
                                           'TOTAL_VOLUME',
                                           random_state=42)
        return final_scenario_df