# NOTE: This section was reconstructed from a patch whose line breaks had been
# lost. Besides the code below (added to modules/utility.py), the patch removed
# one blank line before greet() in cmd_common/common_commands.py, added a
# trailing newline after get_random_reply()'s return statement, and added
# `regex==2024.11.6` to requirements.txt.
import re
from typing import Tuple

try:
    # 'regex' on PyPI supports `\p{L}`, `\p{N}`, etc.
    import regex
    USE_REGEX_LIB = True
except ImportError:
    # Fallback to Python's built-in 're' if 'regex' isn't installed
    USE_REGEX_LIB = False


##############################
# Basic sanitization
# DO NOT RELY SOLELY ON THIS
##############################
def sanitize_user_input(
    user_input: str,
    usage: str = "GENERAL",
    max_length: int = 500
) -> Tuple[str, bool, str, str]:
    """
    A whitelisting-based function for sanitizing user input.

    Returns a tuple of:
        (sanitized_str, sanitization_applied_bool, sanitization_reason, original_str)

    :param user_input: The raw string from the user (e.g., from Twitch or Discord).
                       Non-string values are converted with str().
    :param usage:
        - 'CALC': Keep digits, math operators, parentheses, etc.
        - 'GENERAL': Keep typical readable characters & punctuation.
        Matching is case-insensitive; any value other than 'CALC' is treated
        as 'GENERAL'.
    :param max_length: Truncate the input if it exceeds this length. Negative
                       values are treated as 0; None disables truncation.
    :return: (sanitized_str, bool, reason_string, original_str)
        The bool is True whenever characters were truncated, replaced or
        removed; reason_string lists each step that changed the input, joined
        by "; ". Trimming leading/trailing whitespace alone is not reported.

    ======================
    SECURITY RECOMMENDATIONS
    ======================
    1) For database storage (MariaDB, etc.):
       - **Always** use parameterized queries or an ORM with bound parameters.
       - Do not rely solely on string sanitization to prevent SQL injection.

    2) For code execution (e.g., 'eval'):
       - Avoid using eval/exec on user input.
       - If you must, consider a restricted math parser or an audited sandbox.

    3) For HTML sanitization:
       - Bleach is deprecated; research modern alternatives or frameworks that
         safely sanitize HTML output. This function does *not* sanitize HTML tags.
    """
    original_string = str(user_input)
    reasons = []

    # 1a. Truncate. A negative limit is clamped to 0: passing it straight to a
    #     slice would silently chop characters off the END of the input.
    if max_length is None:
        truncated = original_string
    else:
        truncated = original_string[:max(0, max_length)]
    if truncated != original_string:
        reasons.append("Truncated input to max_length.")

    # 1b. Collapse line breaks and tabs into single spaces. U+2028/U+2029 are
    #     included because they are Unicode separators (category Z) and would
    #     otherwise pass the GENERAL whitelist as live line breaks.
    flattened = re.sub(r"[\r\n\t\u2028\u2029]+", " ", truncated)
    if flattened != truncated:
        reasons.append("Replaced newlines/tabs with spaces.")

    # 2. Choose how to filter based on usage
    if usage.upper() == "CALC":
        # Allow digits, +, -, *, /, %, parentheses, decimal points,
        # ^ for exponent, spaces. Remove everything else.
        cleaned = re.sub(r"[^0-9+\-*/%().^ \t]", "", flattened)
        filter_reason = "CALC: Removed non-math characters."
    else:  # GENERAL usage
        # Remove ASCII control chars (0-31, 127) first; both variants need it.
        without_controls = re.sub(r"[\x00-\x1F\x7F]", "", flattened)
        if USE_REGEX_LIB:
            # Fairly broad whitelist:
            # \p{L}: letters; \p{N}: numbers; \p{P}: punctuation;
            # \p{S}: symbols; \p{Z}: separators (including spaces).
            # This keeps emojis, foreign characters, typical punctuation, etc.
            cleaned = regex.sub(
                r"[^\p{L}\p{N}\p{P}\p{S}\p{Z}]", "", without_controls
            )
            filter_reason = "GENERAL: Removed disallowed chars via regex."
        else:
            # Fallback: 'regex' is not installed, keep ASCII printable only
            # (code points 32-126).
            cleaned = re.sub(r"[^ -~]", "", without_controls)
            filter_reason = (
                "GENERAL: Removed non-ASCII or control chars (fallback)."
            )

    if cleaned != flattened:
        reasons.append(filter_reason)

    # 3. Final trim (cosmetic, so it is deliberately not reported as a reason)
    sanitized = cleaned.strip()

    # 4. Prepare output: the flag is True exactly when a reason was recorded.
    return (sanitized, bool(reasons), "; ".join(reasons), original_string)