Added basic string sanitization

2025-02-02 14:34:30 +01:00 · 2025-02-02 14:34:30 +01:00 · afa45aa913
parent a83e27c7ed
commit afa45aa913
3 changed files with 104 additions and 2 deletions
--- a/cmd_common/common_commands.py
+++ b/cmd_common/common_commands.py
@ -40,7 +40,6 @@ def ping() -> str:

    return response

-
 def greet(target_display_name: str, platform_name: str) -> str:
    """
    Returns a greeting string for the given user displayname on a given platform.
--- a/modules/utility.py
+++ b/modules/utility.py
@ -2,6 +2,15 @@ import time
 import os
 import random
 import json
+import re
+
+try:
+    # The third-party 'regex' package supports Unicode property classes (\p{L}, \p{N}, ...) that the stdlib 're' lacks.
+    import regex
+    USE_REGEX_LIB = True
+except ImportError:
+    # Fallback to Python's built-in 're' if 'regex' isn't installed
+    USE_REGEX_LIB = False

 DICTIONARY_PATH = "dictionary/"  # Path to dictionary files

@ -67,4 +76,97 @@ def get_random_reply(dictionary_name: str, category: str, **variables) -> str:
    response = random.choice(data[category])

    # Replace placeholders with provided variables
-    return response.format(**variables)
+    return response.format(**variables)
+
+##############################
+# Basic sanitization
+# DO NOT RELY SOLELY ON THIS
+##############################
+def sanitize_user_input(
+    user_input: str,
+    usage: str = "GENERAL",
+    max_length: int = 500
+) -> tuple:
+    """
+    A whitelisting-based function for sanitizing user input.
+
+    Every modification is reported: truncation, replacement of line breaks or
+    tabs, and removal of disallowed characters each set the applied flag and
+    add a reason. Trimming leading/trailing whitespace is not reported.
+
+    Returns a tuple of:
+      (sanitized_str, sanitization_applied_bool, sanitization_reason, original_str)
+
+    :param user_input: The raw value from the user (e.g., from Twitch or
+        Discord); it is converted with str() before processing.
+    :param usage: Case-insensitive filter profile.
+        - 'CALC': Keep digits, + - * / % ( ) . ^ and spaces.
+        - 'GENERAL' (or any other value): Keep letters, combining marks,
+          numbers, punctuation, symbols and separators. If the optional
+          'regex' package is missing, only printable ASCII is kept.
+    :param max_length: Truncate the input if it exceeds this length. None
+        disables truncation; negative values are treated as 0.
+    :return: (sanitized_str, bool, reason_string, original_str), where
+        reason_string joins all reasons with '; ' (empty if nothing changed).
+
+    ======================
+    SECURITY RECOMMENDATIONS
+    ======================
+    1) For database storage (MariaDB, etc.):
+       - **Always** use parameterized queries or an ORM with bound parameters.
+       - Do not rely solely on string sanitization to prevent SQL injection.
+
+    2) For code execution (e.g., 'eval'):
+       - Avoid using eval/exec on user input.
+       - If you must, consider a restricted math parser or an audited sandbox.
+       - The 'CALC' profile only restricts the character set; inputs such as
+         '9**9**9' still pass, so it does not make evaluation safe.
+
+    3) For HTML sanitization:
+       - Bleach is deprecated; research modern alternatives or frameworks that
+         safely sanitize HTML output. This function does *not* sanitize HTML tags.
+    """
+
+    original_string = str(user_input)
+    reasons = []
+
+    # 1. Truncate first so the regex passes below only see bounded input.
+    #    Negative limits are clamped: as a slice bound they would drop
+    #    characters from the end instead of capping the length.
+    limit = None if max_length is None else max(0, max_length)
+    sanitized = original_string[:limit]
+    if sanitized != original_string:
+        reasons.append(f"Truncated to {limit} characters.")
+
+    # 2. Collapse line breaks and tabs into a single space. U+2028/U+2029 are
+    #    listed explicitly because they are category Z separators that the
+    #    GENERAL whitelist below would otherwise let through.
+    collapsed = re.sub(r"[\r\n\t\x0b\x0c\x85\u2028\u2029]+", " ", sanitized)
+    if collapsed != sanitized:
+        reasons.append("Replaced line breaks/tabs with spaces.")
+    sanitized = collapsed
+
+    # 3. Apply the whitelist for the requested usage.
+    if usage.upper() == "CALC":
+        filtered = re.sub(r"[^0-9+\-*/%().^ ]", "", sanitized)
+        reason = "CALC: Removed non-math characters."
+    elif USE_REGEX_LIB:
+        # \p{L} letters, \p{M} combining marks, \p{N} numbers, \p{P}
+        # punctuation, \p{S} symbols (emoji), \p{Z} separators. Marks are
+        # required: without them Indic/Thai vowel signs, decomposed accents
+        # and emoji variation selectors are stripped. Control characters are
+        # category C, so this single pass removes them as well.
+        filtered = regex.sub(r"[^\p{L}\p{M}\p{N}\p{P}\p{S}\p{Z}]", "", sanitized)
+        reason = "GENERAL: Removed disallowed chars via regex."
+    else:
+        # Without 'regex' there is no \p{..} support: keep ASCII 32-126 only.
+        filtered = re.sub(r"[^ -~]", "", sanitized)
+        reason = "GENERAL: Removed non-ASCII or control chars (fallback)."
+
+    if filtered != sanitized:
+        reasons.append(reason)
+
+    # 4. Final trim (not reported), then build the result tuple.
+    sanitized = filtered.strip()
+    reason_string = "; ".join(reasons)
+    return (sanitized, bool(reasons), reason_string, original_string)
--- a/requirements.txt
+++ b/requirements.txt
@ -11,3 +11,4 @@ twitchio==2.7.1              # Twitch chat bot library (async)

 # Utility & Logging
 aiohttp==3.9.1               # Async HTTP requests (dependency for discord.py & twitchio)
+regex==2024.11.6             # Unicode-aware regex (\p{L}, \p{N}, ...) used by sanitize_user_input