From 6dd69f596dfb8ddb3d5aa2309c6f48c3f244d197 Mon Sep 17 00:00:00 2001 From: jedarden Date: Mon, 4 May 2026 00:55:47 -0400 Subject: [PATCH] feat(api): add spam/word filter for feedback submission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per plan §13.6, implement a configurable spam filter for the handleCreateFeedback endpoint that: - Validates minimum content length (default 10 chars, configurable via ACB_SPAM_MIN_LENGTH env var) - Normalizes case and maps common look-alike character substitutions back to letters (leetspeak: 0→o, 1→i, 3→e, 4→a, 5→s, 7→t, @→a, $→s, etc.) - Checks content against a block-list of banned terms with word-boundary matching - Returns HTTP 422 (Unprocessable Entity) on filter rejection Configuration: - ACB_SPAM_BLOCK_LIST: comma-separated custom blocked terms (optional, defaults to embedded list of common spam/offensive words) - ACB_SPAM_MIN_LENGTH: minimum feedback content length (default: 10) The embedded default block-list includes: - Profanity and offensive language - Common spam patterns (buy now, click here, free money, etc.) - Scam patterns (send bitcoin, crypto giveaway, urgent, act now, etc.) 
Co-Authored-By: Claude Opus 4.7 --- cmd/acb-api/main.go | 27 ++++- cmd/acb-api/server.go | 9 ++ cmd/acb-api/spamfilter.go | 155 +++++++++++++++++++++++++++++ cmd/acb-api/spamfilter_test.go | 173 +++++++++++++++++++++++++++++++++ 4 files changed, 359 insertions(+), 5 deletions(-) create mode 100644 cmd/acb-api/spamfilter.go create mode 100644 cmd/acb-api/spamfilter_test.go diff --git a/cmd/acb-api/main.go b/cmd/acb-api/main.go index 020c6be..49f6c8d 100644 --- a/cmd/acb-api/main.go +++ b/cmd/acb-api/main.go @@ -12,6 +12,7 @@ import ( "net/http" "os" "os/signal" + "strings" "syscall" "time" @@ -36,6 +37,8 @@ type Config struct { BotTimeoutSecs int StaleJobMinutes int MaxConsecFails int + SpamBlockList string // Comma-separated list of blocked terms (env: ACB_SPAM_BLOCK_LIST) + SpamMinLength int // Minimum feedback content length (env: ACB_SPAM_MIN_LENGTH) } func loadConfig() Config { @@ -54,6 +57,8 @@ func loadConfig() Config { BotTimeoutSecs: envInt("ACB_BOT_TIMEOUT", 5), StaleJobMinutes: envInt("ACB_STALE_JOB_MINUTES", 15), MaxConsecFails: envInt("ACB_MAX_CONSEC_FAILS", 3), + SpamBlockList: os.Getenv("ACB_SPAM_BLOCK_LIST"), + SpamMinLength: envInt("ACB_SPAM_MIN_LENGTH", 10), } } @@ -80,13 +85,25 @@ func main() { cfg: cfg, db: db, rdb: rdb, - regLimiter: ratelimit.NewLimiter(5, 5.0/3600), // 5/hour per IP - feedbackLtr: ratelimit.NewLimiter(20, 20.0/3600), // 20/hour per IP - predictLtr: ratelimit.NewLimiter(60, 60.0/3600), // 60/hour per IP - submitLtr: ratelimit.NewLimiter(5, 5.0/86400), // 5/day per key - voteLtr: ratelimit.NewLimiter(10, 10.0/3600), // 10/hour per IP + regLimiter: ratelimit.NewLimiter(5, 5.0/3600), // 5/hour per IP + feedbackLtr: ratelimit.NewLimiter(20, 20.0/3600), // 20/hour per IP + predictLtr: ratelimit.NewLimiter(60, 60.0/3600), // 60/hour per IP + submitLtr: ratelimit.NewLimiter(5, 5.0/86400), // 5/day per key + voteLtr: ratelimit.NewLimiter(10, 10.0/3600), // 10/hour per IP } + // Initialize spam filter with configurable block-list + var 
blockList []string + if cfg.SpamBlockList != "" { + blockList = strings.Split(cfg.SpamBlockList, ",") + for i := range blockList { + blockList[i] = strings.TrimSpace(blockList[i]) + } + } + srv.spamFilter = NewSpamFilter(blockList, cfg.SpamMinLength) + log.Printf("[SPAMFILTER] initialized with %d blocked terms, min length %d", + srv.spamFilter.BlockedCount(), cfg.SpamMinLength) + // Periodically purge stale rate-limit buckets (every 10 min) go func() { ticker := time.NewTicker(10 * time.Minute) diff --git a/cmd/acb-api/server.go b/cmd/acb-api/server.go index 7256ab7..172554f 100644 --- a/cmd/acb-api/server.go +++ b/cmd/acb-api/server.go @@ -33,6 +33,7 @@ type Server struct { predictLtr *ratelimit.Limiter // 60/hour per IP submitLtr *ratelimit.Limiter // 5/day per bot_id voteLtr *ratelimit.Limiter // 10/hour per IP + spamFilter *SpamFilter // word/spam filter for feedback } func (s *Server) RegisterRoutes(mux *http.ServeMux) { @@ -1495,6 +1496,7 @@ func (s *Server) handlePredictionHistory(w http.ResponseWriter, r *http.Request) // handleCreateFeedback handles POST /api/feedback // Accepts community replay feedback per plan §13.6. // Stores in replay_feedback table with type enum: insight, mistake, idea, highlight. +// Applies spam filtering: minimum length check and blocked term detection. 
func (s *Server) handleCreateFeedback(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodPost { writeError(w, http.StatusMethodNotAllowed, "method not allowed") @@ -1528,6 +1530,13 @@ func (s *Server) handleCreateFeedback(w http.ResponseWriter, r *http.Request) { req.Author = "Anonymous" } + // Apply spam filter: check minimum length and blocked terms + if err := s.spamFilter.Check(req.Body); err != nil { + log.Printf("[FEEDBACK] spam filter rejected: match=%s turn=%d type=%s: %v", req.MatchID, req.Turn, req.Type, err) + writeError(w, http.StatusUnprocessableEntity, err.Error()) + return + } + feedbackID, err := generateID("fb_", 6) if err != nil { log.Printf("failed to generate feedback ID: %v", err) diff --git a/cmd/acb-api/spamfilter.go b/cmd/acb-api/spamfilter.go new file mode 100644 index 0000000..86ec641 --- /dev/null +++ b/cmd/acb-api/spamfilter.go @@ -0,0 +1,155 @@ +package main + +import ( + "fmt" + "strings" + "unicode" +) + +// SpamFilter provides word filtering for feedback submission. +// It normalizes case and strips common unicode substitutions before matching. +type SpamFilter struct { + blockedTerms map[string]struct{} // normalized blocked terms + minLength int // minimum content length +} + +// Default embedded block-list of common spam/offensive terms. +var defaultBlockList = []string{ + // Profanity and offensive language + "fuck", "shit", "ass", "bitch", "damn", "crap", + // Common spam patterns + "buy now", "click here", "free money", "winner", "congratulations", + "viagra", "cialis", "porn", "xxx", "casino", "lottery", + // Scam patterns + "send bitcoin", "crypto giveaway", "urgent", "act now", + // All-caps spam (normalized to lowercase) + "clickbait", "subscribe", "like and subscribe", +} + +// unicodeReplacements maps common unicode substitutions to their ASCII equivalents. 
+// Each key is a look-alike character and each value the ASCII letter it is
+// read as; normalize consults this table one rune at a time.
+var unicodeReplacements = map[rune]rune{
+	'0': 'o',
+	'1': 'i',
+	'3': 'e',
+	'4': 'a',
+	'5': 's',
+	'7': 't',
+	'@': 'a',
+	'$': 's',
+	'+': 't',
+	'|': 'i',
+	'!': 'i',
+	'©': 'c',
+	'®': 'r',
+}
+
+// NewSpamFilter creates a spam filter with the given block-list and minimum length.
+// If blockList is nil or empty, the embedded default list is used.
+// If minLength is 0, it defaults to 10 characters; any other value is kept as given.
+//
+// Every term is normalized and stripped of surrounding whitespace before it is
+// stored, so an entry such as " spam " still matches the bare word. Terms that
+// end up empty are dropped. A non-empty blockList whose terms are all dropped
+// yields a filter with no blocked terms; it does not fall back to the defaults.
+func NewSpamFilter(blockList []string, minLength int) *SpamFilter {
+	if minLength == 0 {
+		minLength = 10
+	}
+
+	// Fall back to the embedded list when the caller supplies no terms.
+	terms := blockList
+	if len(terms) == 0 {
+		terms = defaultBlockList
+	}
+
+	sf := &SpamFilter{
+		blockedTerms: make(map[string]struct{}, len(terms)),
+		minLength:    minLength,
+	}
+
+	for _, term := range terms {
+		// Trim after normalizing: a padded key could only ever match text
+		// that carries exactly the same padding.
+		if normalized := strings.TrimSpace(sf.normalize(term)); normalized != "" {
+			sf.blockedTerms[normalized] = struct{}{}
+		}
+	}
+
+	return sf
+}
+
+// normalize returns the canonical form of s used for matching: characters
+// listed in unicodeReplacements become their ASCII letter, every other rune is
+// lowercased, and runes that are neither printable nor whitespace are dropped.
+// Each run of whitespace (spaces, tabs, newlines, non-breaking spaces, ...)
+// collapses to a single ASCII space so that multi-word terms such as "buy now"
+// cannot be dodged with extra or unusual spacing.
+func (sf *SpamFilter) normalize(s string) string {
+	var out strings.Builder
+	out.Grow(len(s))
+
+	inSpace := false // true while the last byte written is the collapsed space
+	for _, r := range s {
+		switch {
+		case unicode.IsSpace(r):
+			if !inSpace {
+				out.WriteByte(' ')
+				inSpace = true
+			}
+		case !unicode.IsPrint(r):
+			// Dropped without resetting inSpace, so an invisible rune placed
+			// between two spaces does not defeat the collapsing.
+		default:
+			if repl, ok := unicodeReplacements[r]; ok {
+				out.WriteRune(repl)
+			} else {
+				out.WriteRune(unicode.ToLower(r))
+			}
+			inSpace = false
+		}
+	}
+
+	return out.String()
+}
+
+// Check validates content against the spam filter.
+// Returns an error if content is too short or contains blocked terms.
+//
+// The minimum length is counted in characters (runes) after surrounding
+// whitespace is trimmed, so multi-byte text or whitespace padding cannot
+// satisfy it with fewer visible characters than configured. Terms are matched
+// against the normalized content and, when the content contains '!', against
+// a second variant in which a '!' that ends a word counts as punctuation:
+// normalize turns every '!' into 'i', which on its own would let "buy now!"
+// slip past a "buy now" entry.
+func (sf *SpamFilter) Check(content string) error {
+	trimmed := strings.TrimSpace(content)
+
+	// len(content) would count bytes; the limit and its message are in characters.
+	if len([]rune(trimmed)) < sf.minLength {
+		return fmt.Errorf("content must be at least %d characters", sf.minLength)
+	}
+
+	// Only reachable when minLength is not positive, where the length check
+	// above lets empty input through.
+	if trimmed == "" {
+		return fmt.Errorf("content cannot be empty")
+	}
+
+	candidates := []string{sf.normalize(content)}
+	if strings.Contains(content, "!") {
+		candidates = append(candidates, sf.normalize(exclamationsAsPunctuation(content)))
+	}
+
+	// Check for blocked terms (word-boundary aware)
+	for blocked := range sf.blockedTerms {
+		for _, text := range candidates {
+			if sf.containsWord(text, blocked) {
+				return fmt.Errorf("content contains blocked term")
+			}
+		}
+	}
+
+	return nil
+}
+
+// exclamationsAsPunctuation returns s with every '!' that is not directly
+// followed by an ASCII letter or digit replaced by a space. A '!' that is
+// followed by a letter or digit ("v!agra") is left alone so it can still be
+// read as a letter.
+func exclamationsAsPunctuation(s string) string {
+	b := []byte(s)
+	// Walk right to left so a run like "!!!" is cleared completely: each '!'
+	// sees the space already written over its right-hand neighbor. '!' is a
+	// single ASCII byte, so rewriting it in place cannot split a multi-byte
+	// character.
+	for i := len(b) - 1; i >= 0; i-- {
+		if b[i] == '!' && (i == len(b)-1 || !isAlphanumeric(b[i+1])) {
+			b[i] = ' '
+		}
+	}
+	return string(b)
+}
+
+// containsWord checks if text contains the given word as a whole word (not substring).
+// A match counts only when the bytes directly before and after it are not
+// ASCII letters or digits (or the match touches the start or end of text).
+// Boundaries are judged per byte, so any non-ASCII character acts as a
+// separator. An empty word never matches.
+func (sf *SpamFilter) containsWord(text, word string) bool {
+	if word == "" {
+		return false
+	}
+
+	for from := 0; from < len(text); {
+		rel := strings.Index(text[from:], word)
+		if rel < 0 {
+			return false
+		}
+		start := from + rel
+		end := start + len(word)
+
+		beforeOK := start == 0 || !isAlphanumeric(text[start-1])
+		afterOK := end == len(text) || !isAlphanumeric(text[end])
+		if beforeOK && afterOK {
+			return true
+		}
+
+		// Embedded in a longer word here; resume one byte further on so
+		// overlapping occurrences are still examined.
+		from = start + 1
+	}
+
+	return false
+}
+
+// isAlphanumeric returns true if the byte is an ASCII letter (either case) or digit.
+func isAlphanumeric(b byte) bool {
+	return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || (b >= '0' && b <= '9')
+}
+
+// BlockedCount returns the number of blocked terms in the filter.
+func (sf *SpamFilter) BlockedCount() int {
+	return len(sf.blockedTerms)
+}
diff --git a/cmd/acb-api/spamfilter_test.go b/cmd/acb-api/spamfilter_test.go
new file mode 100644
index 0000000..4acacc4
--- /dev/null
+++ b/cmd/acb-api/spamfilter_test.go
@@ -0,0 +1,181 @@
+package main
+
+import (
+	"testing"
+)
+
+// TestSpamFilter_MinLength checks that Check enforces the configured minimum length.
+func TestSpamFilter_MinLength(t *testing.T) {
+	sf := NewSpamFilter(nil, 10)
+
+	tests := []struct {
+		name    string
+		content string
+		wantErr bool
+	}{
+		{"empty", "", true},
+		{"too short", "hi", true},
+		{"exactly min", "1234567890", false},
+		{"above min", "this is valid feedback", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := sf.Check(tt.content)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Check() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestSpamFilter_BlockedTerms checks whole-word, case-insensitive matching of a custom block-list.
+func TestSpamFilter_BlockedTerms(t *testing.T) {
+	customList := []string{"spam", "scam", "viagra"}
+	sf := NewSpamFilter(customList, 5)
+
+	tests := []struct {
+		name    string
+		content string
+		wantErr bool
+	}{
+		{"clean content", "this is good feedback", false},
+		{"exact blocked", "spam here", true},
+		{"blocked at start", "scam alert", true},
+		{"blocked at end", "buy viagra", true},
+		{"blocked in middle", "this is a scam attempt", true},
+		{"case insensitive", "SPAM everywhere", true},
+		{"mixed case", "VIAGRA pills", true},
+		{"substring not blocked", "spamming is okay", false},  // "spamming" != "spam"
+		{"partial word not blocked", "this is spammy", false}, // "spammy" != "spam"
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := sf.Check(tt.content)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Check() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestSpamFilter_UnicodeNormalization checks that look-alike substitutions do not hide blocked terms.
+func TestSpamFilter_UnicodeNormalization(t *testing.T) {
+	customList := []string{"viagra", "casino"}
+	sf := NewSpamFilter(customList, 5)
+
+	tests := []struct {
+		name    string
+		content string
+		wantErr bool
+	}{
+		{"1 for i", "v1agra pills", true},
+		{"1 and @", "v1@gra pills", true},
+		{"1 and trailing @", "v1agr@ is bad", true},
+		{"@, 1 and 0", "c@s1n0 royale", true},
+		{"digits only", "c451n0 royale", true},
+		{"@, 5 and 0", "c@5in0 royale", true},
+		{"mixed leetspeak", "v1@gr@ and c@s1n0", true},
+		{"clean content", "this is okay", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := sf.Check(tt.content)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Check() error = %v, wantErr %v", err, tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestSpamFilter_DefaultBlockList checks that a nil block-list falls back to the embedded defaults.
+func TestSpamFilter_DefaultBlockList(t *testing.T) {
+	sf := NewSpamFilter(nil, 5)
+
+	// Check that default block-list has terms
+	if sf.BlockedCount() == 0 {
+		t.Error("default block list is empty")
+	}
+
+	// Test some terms from the default list
+	tests := []struct {
+		content string
+		wantErr bool
+	}{
+		{"this is good feedback", false},
+		{"buy now click here", true},
+		{"free money winner", true},
+		{"send bitcoin scam", true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.content, func(t *testing.T) {
+			err := sf.Check(tt.content)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Check(%q) error = %v, wantErr %v", tt.content, err, tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestSpamFilter_WordBoundaries checks that blocked terms match only as whole words.
+func TestSpamFilter_WordBoundaries(t *testing.T) {
+	// A minimum length of 1 keeps the length check out of the way: with a larger
+	// minimum, short inputs such as "ass" would be rejected for their length and
+	// the cases below would pass without exercising boundary matching at all.
+	customList := []string{"ass", "casino"}
+	sf := NewSpamFilter(customList, 1)
+
+	tests := []struct {
+		name    string
+		content string
+		wantErr bool
+	}{
+		{"exact match", "ass", true},
+		{"with space before", " this ass", true},
+		{"with space after", "ass ", true},
+		{"in middle", "this ass here", true},
+		{"with punctuation", "ass.", true},
+		{"substring should not match", "this is classic", false},     // "ass" in "classic"
+		{"substring should not match 2", "cassandra is cool", false}, // "ass" in "cassandra"
+		{"casino exact", "casino", true},
+		{"casino plural", "casinos", false}, // different word
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := sf.Check(tt.content)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Check(%q) error = %v, wantErr %v", tt.content, err, tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestNormalize checks lowercasing and character replacement on their own.
+func TestNormalize(t *testing.T) {
+	sf := NewSpamFilter(nil, 5)
+
+	tests := []struct {
+		input    string
+		expected string
+	}{
+		{"ViAgRA", "viagra"},
+		{"V1@GR@", "viagra"}, // 1→i, @→a
+		{"C451N0", "casino"}, // 4→a, 5→s, 0→o, 1→i
+		{"Test!", "testi"},   // !→i
+		{"Mixed CASE", "mixed case"},
+		{"0wned", "owned"}, // 0→o
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.input, func(t *testing.T) {
+			got := sf.normalize(tt.input)
+			if got != tt.expected {
+				t.Errorf("normalize(%q) = %q, want %q", tt.input, got, tt.expected)
+			}
+		})
+	}
+}