Added basic string sanitization
parent
a83e27c7ed
commit
afa45aa913
|
@ -40,7 +40,6 @@ def ping() -> str:
|
|||
|
||||
return response
|
||||
|
||||
|
||||
def greet(target_display_name: str, platform_name: str) -> str:
|
||||
"""
|
||||
Returns a greeting string for the given user displayname on a given platform.
|
||||
|
|
|
@ -2,6 +2,15 @@ import time
|
|||
import os
|
||||
import random
|
||||
import json
|
||||
import re
|
||||
|
||||
try:
|
||||
# 'regex' on PyPI supports `\p{L}`, `\p{N}`, etc.
|
||||
import regex
|
||||
USE_REGEX_LIB = True
|
||||
except ImportError:
|
||||
# Fallback to Python's built-in 're' if 'regex' isn't installed
|
||||
USE_REGEX_LIB = False
|
||||
|
||||
DICTIONARY_PATH = "dictionary/" # Path to dictionary files
|
||||
|
||||
|
@ -67,4 +76,97 @@ def get_random_reply(dictionary_name: str, category: str, **variables) -> str:
|
|||
response = random.choice(data[category])
|
||||
|
||||
# Replace placeholders with provided variables
|
||||
return response.format(**variables)
|
||||
return response.format(**variables)
|
||||
|
||||
##############################
|
||||
# Basic sanitization
|
||||
# DO NOT RELY SOLELY ON THIS
|
||||
##############################
|
||||
def sanitize_user_input(
    user_input: str,
    usage: str = "GENERAL",
    max_length: int = 500
):
    """
    A whitelisting-based function for sanitizing user input.

    Processing order:
      1. Convert the input to ``str`` and truncate it to ``max_length``.
      2. Collapse each run of CR / LF / TAB into a single space.
      3. Remove every character outside the whitelist selected by ``usage``.
      4. Strip leading and trailing whitespace.

    Returns a tuple of:
        (sanitized_str, sanitization_applied_bool, sanitization_reason, original_str)

    ``sanitization_applied_bool`` is True when the input was truncated or when
    the whitelist removed at least one character. Whitespace normalisation
    (steps 2 and 4) alone is not reported.

    :param user_input: The raw string from the user (e.g., from Twitch or Discord).
                       Non-string values are converted with ``str()``.
    :param usage:
        - 'CALC': Keep digits, math operators, parentheses, etc.
        - 'GENERAL': Keep typical readable characters & punctuation.
        Matching is case-insensitive; any value other than 'CALC' is treated
        as 'GENERAL'.
    :param max_length: Truncate the input if it exceeds this length.
                       Negative values are treated as 0; ``None`` disables
                       truncation.
    :return: (sanitized_str, bool, reason_string, original_str), where
             reason_string joins all reasons with "; " and is empty when
             nothing was reported.

    ======================
    SECURITY RECOMMENDATIONS
    ======================
    1) For database storage (MariaDB, etc.):
       - **Always** use parameterized queries or an ORM with bound parameters.
       - Do not rely solely on string sanitization to prevent SQL injection.

    2) For code execution (e.g., 'eval'):
       - Avoid using eval/exec on user input.
       - If you must, consider a restricted math parser or an audited sandbox.

    3) For HTML sanitization:
       - Bleach is deprecated; research modern alternatives or frameworks that
         safely sanitize HTML output. This function does *not* sanitize HTML tags.
    """
    original_string = str(user_input)
    reasons = []

    # 1. Truncate and remove newlines, tabs, etc.
    # A negative limit would make the slice drop characters from the END of
    # the string instead of limiting its length, so clamp it to zero.
    limit = max_length
    if limit is not None and limit < 0:
        limit = 0
    truncated = original_string[:limit]
    if len(truncated) != len(original_string):
        # Truncation discards user data, so it must be visible to the caller.
        reasons.append(f"LENGTH: Truncated input to {limit} characters.")

    sanitized = re.sub(r"[\r\n\t]+", " ", truncated)

    # 2. Choose how to filter based on usage
    if usage.upper() == "CALC":
        # Allow digits, +, -, *, /, %, parentheses, decimal points,
        # ^ for exponent, spaces. Remove everything else.
        filtered = re.sub(r"[^0-9+\-*/%().^ \t]", "", sanitized)
        if filtered != sanitized:
            reasons.append("CALC: Removed non-math characters.")
    else:  # GENERAL usage
        # Remove ASCII control chars (0-31, 127) first; both variants need it.
        filtered = re.sub(r"[\x00-\x1F\x7F]", "", sanitized)

        if USE_REGEX_LIB:
            # Fairly broad whitelist:
            # \p{L}: letters; \p{N}: numbers; \p{P}: punctuation;
            # \p{S}: symbols; \p{Z}: separators (including spaces).
            # NOTE(review): \p{M} (combining marks) and \p{Cf} (format chars
            # such as ZWJ) are not whitelisted, so decomposed accents,
            # variation selectors and ZWJ emoji sequences lose characters.
            # Confirm whether that is intended before widening the whitelist.
            filtered = regex.sub(r"[^\p{L}\p{N}\p{P}\p{S}\p{Z}]", "", filtered)
            removal_reason = "GENERAL: Removed disallowed chars via regex."
        else:
            # Fallback: 'regex' is not installed, keep ASCII printable only
            # (code points 32-126).
            filtered = re.sub(r"[^ -~]", "", filtered)
            removal_reason = "GENERAL: Removed non-ASCII or control chars (fallback)."

        if filtered != sanitized:
            reasons.append(removal_reason)

    # 3. Final trim
    sanitized = filtered.strip()

    # 4. Prepare output
    return (sanitized, bool(reasons), "; ".join(reasons), original_string)
|
||||
|
|
|
@ -11,3 +11,4 @@ twitchio==2.7.1 # Twitch chat bot library (async)
|
|||
|
||||
# Utility & Logging
|
||||
aiohttp==3.9.1 # Async HTTP requests (dependency for discord.py & twitchio)
|
||||
regex==2024.11.6 # Unicode-aware regular expressions (\p{L}, \p{N}, ...); optional, code falls back to built-in 're'
|
||||
|
|
Loading…
Reference in New Issue