feat(api): add spam/word filter for feedback submission

Per plan §13.6, implement a configurable spam filter for the handleCreateFeedback endpoint that: - Validates minimum content length (default 10 chars, configurable via ACB_SPAM_MIN_LENGTH env var) - Normalizes case and maps common look-alike character substitutions back to letters (leetspeak: 0→o, 1→i, 3→e, 4→a, 5→s, 7→t, @→a, $→s, etc.) - Checks content against a block-list of banned terms with word-boundary matching - Returns HTTP 422 (Unprocessable Entity) on filter rejection. Configuration: - ACB_SPAM_BLOCK_LIST: comma-separated custom blocked terms (optional, defaults to embedded list of common spam/offensive words) - ACB_SPAM_MIN_LENGTH: minimum feedback content length (default: 10). The embedded default block-list includes: - Profanity and offensive language - Common spam patterns (buy now, click here, free money, etc.) - Scam patterns (send bitcoin, crypto giveaway, urgent, act now) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-04 00:55:47 -04:00 · 2026-05-04 00:55:47 -04:00 · 6dd69f596d
commit 6dd69f596d
parent 39fe612f6a
4 changed files with 359 additions and 5 deletions
--- a/cmd/acb-api/main.go
+++ b/cmd/acb-api/main.go
@ -12,6 +12,7 @@ import (
 	"net/http"
 	"os"
 	"os/signal"
+	"strings"
 	"syscall"
 	"time"

@ -36,6 +37,8 @@ type Config struct {
 	BotTimeoutSecs   int
 	StaleJobMinutes  int
 	MaxConsecFails   int
+	SpamBlockList    string // Comma-separated list of blocked terms (env: ACB_SPAM_BLOCK_LIST)
+	SpamMinLength    int    // Minimum feedback content length (env: ACB_SPAM_MIN_LENGTH)
 }

 func loadConfig() Config {
@ -54,6 +57,8 @@ func loadConfig() Config {
 		BotTimeoutSecs:  envInt("ACB_BOT_TIMEOUT", 5),
 		StaleJobMinutes: envInt("ACB_STALE_JOB_MINUTES", 15),
 		MaxConsecFails:  envInt("ACB_MAX_CONSEC_FAILS", 3),
+		SpamBlockList:   os.Getenv("ACB_SPAM_BLOCK_LIST"),
+		SpamMinLength:   envInt("ACB_SPAM_MIN_LENGTH", 10),
 	}
 }

@ -80,13 +85,25 @@ func main() {
 		cfg:         cfg,
 		db:          db,
 		rdb:         rdb,
-		regLimiter:  ratelimit.NewLimiter(5, 5.0/3600),     // 5/hour per IP
-		feedbackLtr: ratelimit.NewLimiter(20, 20.0/3600),   // 20/hour per IP
-		predictLtr:  ratelimit.NewLimiter(60, 60.0/3600),   // 60/hour per IP
-		submitLtr:   ratelimit.NewLimiter(5, 5.0/86400),    // 5/day per key
-		voteLtr:     ratelimit.NewLimiter(10, 10.0/3600),   // 10/hour per IP
+		regLimiter:  ratelimit.NewLimiter(5, 5.0/3600),   // 5/hour per IP
+		feedbackLtr: ratelimit.NewLimiter(20, 20.0/3600), // 20/hour per IP
+		predictLtr:  ratelimit.NewLimiter(60, 60.0/3600), // 60/hour per IP
+		submitLtr:   ratelimit.NewLimiter(5, 5.0/86400),  // 5/day per key
+		voteLtr:     ratelimit.NewLimiter(10, 10.0/3600), // 10/hour per IP
 	}

+	// Initialize spam filter with configurable block-list
+	var blockList []string
+	if cfg.SpamBlockList != "" {
+		blockList = strings.Split(cfg.SpamBlockList, ",")
+		for i := range blockList {
+			blockList[i] = strings.TrimSpace(blockList[i])
+		}
+	}
+	srv.spamFilter = NewSpamFilter(blockList, cfg.SpamMinLength)
+	log.Printf("[SPAMFILTER] initialized with %d blocked terms, min length %d",
+		srv.spamFilter.BlockedCount(), cfg.SpamMinLength)
+
 	// Periodically purge stale rate-limit buckets (every 10 min)
 	go func() {
 		ticker := time.NewTicker(10 * time.Minute)
--- a/cmd/acb-api/server.go
+++ b/cmd/acb-api/server.go
@ -33,6 +33,7 @@ type Server struct {
 	predictLtr   *ratelimit.Limiter // 60/hour per IP
 	submitLtr    *ratelimit.Limiter // 5/day per bot_id
 	voteLtr      *ratelimit.Limiter // 10/hour per IP
+	spamFilter   *SpamFilter        // word/spam filter for feedback
 }

 func (s *Server) RegisterRoutes(mux *http.ServeMux) {
@ -1495,6 +1496,7 @@ func (s *Server) handlePredictionHistory(w http.ResponseWriter, r *http.Request)
 // handleCreateFeedback handles POST /api/feedback
 // Accepts community replay feedback per plan §13.6.
 // Stores in replay_feedback table with type enum: insight, mistake, idea, highlight.
+// Applies spam filtering: minimum length check and blocked term detection.
 func (s *Server) handleCreateFeedback(w http.ResponseWriter, r *http.Request) {
 	if r.Method != http.MethodPost {
 		writeError(w, http.StatusMethodNotAllowed, "method not allowed")
@ -1528,6 +1530,13 @@ func (s *Server) handleCreateFeedback(w http.ResponseWriter, r *http.Request) {
 		req.Author = "Anonymous"
 	}

+	// Apply spam filter: check minimum length and blocked terms
+	if err := s.spamFilter.Check(req.Body); err != nil {
+		log.Printf("[FEEDBACK] spam filter rejected: match=%s turn=%d type=%s: %v", req.MatchID, req.Turn, req.Type, err)
+		writeError(w, http.StatusUnprocessableEntity, err.Error())
+		return
+	}
+
 	feedbackID, err := generateID("fb_", 6)
 	if err != nil {
 		log.Printf("failed to generate feedback ID: %v", err)
--- a/cmd/acb-api/spamfilter.go
+++ b/cmd/acb-api/spamfilter.go
@ -0,0 +1,155 @@
+package main
+
+import (
+	"fmt"
+	"strings"
+	"unicode"
+)
+
+// SpamFilter screens feedback text before it is accepted. It enforces a
+// minimum content length and rejects text that contains a blocked term.
+// Text and terms are compared after normalize has lowercased them and
+// applied the substitutions listed in unicodeReplacements.
+type SpamFilter struct {
+	// blockedTerms is the set of blocked terms, stored in normalized form.
+	blockedTerms map[string]struct{}
+	// minLength is the smallest content length that Check accepts.
+	minLength int
+}
+
+// defaultBlockList holds the built-in blocked terms that NewSpamFilter uses
+// when the caller supplies no list of its own.
+//
+// NOTE(review): several entries ("winner", "congratulations", "urgent",
+// "subscribe", "damn") are ordinary words and may reject legitimate
+// feedback; confirm the list against product expectations.
+var defaultBlockList = []string{
+	// Profanity and offensive language.
+	"fuck",
+	"shit",
+	"ass",
+	"bitch",
+	"damn",
+	"crap",
+
+	// Common spam patterns.
+	"buy now",
+	"click here",
+	"free money",
+	"winner",
+	"congratulations",
+	"viagra",
+	"cialis",
+	"porn",
+	"xxx",
+	"casino",
+	"lottery",
+
+	// Scam patterns.
+	"send bitcoin",
+	"crypto giveaway",
+	"urgent",
+	"act now",
+
+	// Engagement-bait phrases.
+	"clickbait",
+	"subscribe",
+	"like and subscribe",
+}
+
+// unicodeReplacements maps look-alike characters to the lowercase ASCII
+// letters they are commonly used to imitate. Despite the name, most keys are
+// ASCII digits and punctuation (leetspeak); only '©' and '®' are non-ASCII.
+//
+// NOTE(review): '!', '|', '$' and '+' are also everyday punctuation, so
+// normalize turns "Test!" into "testi". In the normalized text a word that
+// is followed directly by '!' therefore no longer ends on a word boundary.
+var unicodeReplacements = map[rune]rune{
+	// Digits standing in for letters.
+	'0': 'o', '1': 'i', '3': 'e', '4': 'a', '5': 's', '7': 't',
+	// ASCII punctuation standing in for letters.
+	'@': 'a', '$': 's', '+': 't', '|': 'i', '!': 'i',
+	// Non-ASCII symbols standing in for letters.
+	'©': 'c', '®': 'r',
+}
+
+// NewSpamFilter builds a SpamFilter from blockList and minLength.
+//
+// Each term is normalized and trimmed of surrounding whitespace before it is
+// stored; terms that end up empty are dropped. If blockList is nil, empty, or
+// contains no usable term, the embedded defaultBlockList is used instead, so
+// a malformed list (for example "," or " ") cannot silently leave the filter
+// without any blocked terms. If minLength is 0, it defaults to 10.
+func NewSpamFilter(blockList []string, minLength int) *SpamFilter {
+	if minLength == 0 {
+		minLength = 10
+	}
+
+	sf := &SpamFilter{minLength: minLength}
+	sf.blockedTerms = sf.normalizedTermSet(blockList)
+	if len(sf.blockedTerms) == 0 {
+		sf.blockedTerms = sf.normalizedTermSet(defaultBlockList)
+	}
+
+	return sf
+}
+
+// normalizedTermSet returns the set of non-empty normalized forms of terms.
+func (sf *SpamFilter) normalizedTermSet(terms []string) map[string]struct{} {
+	set := make(map[string]struct{}, len(terms))
+	for _, term := range terms {
+		// Trim after normalizing so whitespace exposed by dropped
+		// non-printable runes is removed as well.
+		if normalized := strings.TrimSpace(sf.normalize(term)); normalized != "" {
+			set[normalized] = struct{}{}
+		}
+	}
+	return set
+}
+
+// normalize returns s in the canonical form used for matching: runes that are
+// neither printable nor whitespace are removed, runes listed in
+// unicodeReplacements are swapped for their mapped letter, and every other
+// rune is lowercased.
+func (sf *SpamFilter) normalize(s string) string {
+	canonical := func(r rune) rune {
+		if !unicode.IsPrint(r) && !unicode.IsSpace(r) {
+			// A negative result tells strings.Map to drop the rune.
+			return -1
+		}
+		if mapped, ok := unicodeReplacements[r]; ok {
+			return mapped
+		}
+		return unicode.ToLower(r)
+	}
+
+	return strings.Map(canonical, s)
+}
+
+// Check validates content against the spam filter.
+//
+// It returns an error when content has fewer than minLength characters
+// (counted as Unicode code points, not bytes), when it is empty, or when it
+// contains a blocked term as a whole word. It returns nil otherwise.
+func (sf *SpamFilter) Check(content string) error {
+	// Count runes so multi-byte text is not credited with extra characters.
+	if len([]rune(content)) < sf.minLength {
+		return fmt.Errorf("content must be at least %d characters", sf.minLength)
+	}
+
+	// Only reachable when minLength is zero or negative; a positive minimum
+	// has already rejected empty content above.
+	if content == "" {
+		return fmt.Errorf("content cannot be empty")
+	}
+
+	// Match against two views of the text. The normalized view undoes
+	// look-alike substitutions ("v1agra"). The lowercase-only view keeps
+	// punctuation intact: normalize rewrites characters such as '!' into
+	// letters, which would otherwise fuse "viagra!" into "viagrai" and hide
+	// the blocked word behind a false word character.
+	views := [2]string{sf.normalize(content), strings.ToLower(content)}
+
+	for blocked := range sf.blockedTerms {
+		for _, view := range views {
+			if sf.containsWord(view, blocked) {
+				return fmt.Errorf("content contains blocked term")
+			}
+		}
+	}
+
+	return nil
+}
+
+// containsWord reports whether text contains word as a whole word rather than
+// as part of a longer one. An occurrence counts only when it is bounded on
+// both sides by the start or end of text or by a non-word character (see
+// isWordRune). text is expected to be lowercased already. An empty word never
+// matches.
+func (sf *SpamFilter) containsWord(text, word string) bool {
+	if word == "" {
+		// Every position "contains" the empty string; report no match
+		// instead of a hit wherever two boundaries happen to meet.
+		return false
+	}
+
+	from := 0
+	for {
+		rel := strings.Index(text[from:], word)
+		if rel < 0 {
+			return false
+		}
+		start := from + rel
+		end := start + len(word)
+
+		// The Trim*Func helpers decode whole runes, so a non-ASCII letter
+		// next to the occurrence counts as part of the word instead of
+		// being mistaken, byte by byte, for a boundary. Trimming removes
+		// nothing exactly when the adjacent rune is not a word rune.
+		before, after := text[:start], text[end:]
+		beforeOK := len(strings.TrimRightFunc(before, isWordRune)) == len(before)
+		afterOK := len(strings.TrimLeftFunc(after, isWordRune)) == len(after)
+		if beforeOK && afterOK {
+			return true
+		}
+
+		// Resume one byte later so overlapping occurrences are still found.
+		from = start + 1
+	}
+}
+
+// isWordRune reports whether r is part of a word. ASCII is delegated to
+// isAlphanumeric; beyond ASCII any Unicode letter or digit counts.
+func isWordRune(r rune) bool {
+	if r < 0x80 {
+		return isAlphanumeric(byte(r))
+	}
+	return unicode.IsLetter(r) || unicode.IsDigit(r)
+}
+
+// isAlphanumeric reports whether b is an ASCII lowercase letter ('a'-'z') or
+// an ASCII digit ('0'-'9'). Uppercase letters and all other bytes yield false.
+func isAlphanumeric(b byte) bool {
+	switch {
+	case 'a' <= b && b <= 'z':
+		return true
+	case '0' <= b && b <= '9':
+		return true
+	default:
+		return false
+	}
+}
+
+// BlockedCount reports how many distinct normalized terms the filter holds.
+func (sf *SpamFilter) BlockedCount() int {
+	count := len(sf.blockedTerms)
+	return count
+}
--- a/cmd/acb-api/spamfilter_test.go
+++ b/cmd/acb-api/spamfilter_test.go
@ -0,0 +1,173 @@
+package main
+
+import (
+	"testing"
+)
+
+// TestSpamFilter_MinLength verifies that Check rejects content shorter than
+// the configured minimum and accepts content at or above it.
+func TestSpamFilter_MinLength(t *testing.T) {
+	filter := NewSpamFilter(nil, 10)
+
+	type lengthCase struct {
+		name     string
+		content  string
+		rejected bool
+	}
+	cases := []lengthCase{
+		{name: "empty", content: "", rejected: true},
+		{name: "too short", content: "hi", rejected: true},
+		{name: "exactly min", content: "1234567890", rejected: false},
+		{name: "above min", content: "this is valid feedback", rejected: false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			if err := filter.Check(tc.content); (err != nil) != tc.rejected {
+				t.Errorf("Check() error = %v, wantErr %v", err, tc.rejected)
+			}
+		})
+	}
+}
+
+// TestSpamFilter_BlockedTerms checks whole-word, case-insensitive matching
+// against a caller-supplied block-list.
+func TestSpamFilter_BlockedTerms(t *testing.T) {
+	filter := NewSpamFilter([]string{"spam", "scam", "viagra"}, 5)
+
+	cases := []struct {
+		label   string
+		input   string
+		blocked bool
+	}{
+		{label: "clean content", input: "this is good feedback", blocked: false},
+		{label: "exact blocked", input: "spam here", blocked: true},
+		{label: "blocked at start", input: "scam alert", blocked: true},
+		{label: "blocked at end", input: "buy viagra", blocked: true},
+		{label: "blocked in middle", input: "this is a scam attempt", blocked: true},
+		{label: "case insensitive", input: "SPAM everywhere", blocked: true},
+		{label: "mixed case", input: "VIAGRA pills", blocked: true},
+		// A blocked term embedded in a longer word must not trigger the filter.
+		{label: "substring not blocked", input: "spamming is okay", blocked: false},
+		{label: "partial word not blocked", input: "this is spammy", blocked: false},
+	}
+
+	for _, c := range cases {
+		t.Run(c.label, func(t *testing.T) {
+			if err := filter.Check(c.input); (err != nil) != c.blocked {
+				t.Errorf("Check() error = %v, wantErr %v", err, c.blocked)
+			}
+		})
+	}
+}
+
+// TestSpamFilter_UnicodeNormalization verifies that blocked terms are still
+// detected when letters are disguised with the look-alike characters listed
+// in unicodeReplacements. Each case name states the substitutions it uses.
+func TestSpamFilter_UnicodeNormalization(t *testing.T) {
+	customList := []string{"viagra", "casino", "lottery"}
+	sf := NewSpamFilter(customList, 5)
+
+	tests := []struct {
+		name    string
+		content string
+		wantErr bool
+	}{
+		{"1 for i", "v1agra pills", true},
+		{"1 for i and @ for a", "v1@gra pills", true},
+		{"1 for i and trailing @ for a", "v1agr@ is bad", true},
+		{"@ 1 0 in casino", "c@s1n0 royale", true},
+		{"4 5 1 0 in casino", "c451n0 royale", true},
+		{"@ 5 0 in casino", "c@5in0 royale", true},
+		{"0 7 3 in lottery", "win the l0773ry", true},
+		{"$ for s", "ca$ino royale", true},
+		{"mixed leetspeak", "v1@gr@ and c@s1n0", true},
+		{"clean content", "this is okay", false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := sf.Check(tt.content)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Check(%q) error = %v, wantErr %v", tt.content, err, tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestSpamFilter_DefaultBlockList confirms that a nil block-list installs the
+// embedded defaults and that representative phrases from it are rejected.
+func TestSpamFilter_DefaultBlockList(t *testing.T) {
+	filter := NewSpamFilter(nil, 5)
+
+	// The embedded list must contribute at least one term.
+	if n := filter.BlockedCount(); n == 0 {
+		t.Error("default block list is empty")
+	}
+
+	// Samples built from phrases in the default list, plus one clean control.
+	samples := []struct {
+		text     string
+		rejected bool
+	}{
+		{text: "this is good feedback", rejected: false},
+		{text: "buy now click here", rejected: true},
+		{text: "free money winner", rejected: true},
+		{text: "send bitcoin scam", rejected: true},
+	}
+
+	for _, s := range samples {
+		t.Run(s.text, func(t *testing.T) {
+			if err := filter.Check(s.text); (err != nil) != s.rejected {
+				t.Errorf("Check(%q) error = %v, wantErr %v", s.text, err, s.rejected)
+			}
+		})
+	}
+}
+
+// TestSpamFilter_WordBoundaries verifies that a blocked term matches only as
+// a whole word and never as part of a longer word.
+func TestSpamFilter_WordBoundaries(t *testing.T) {
+	customList := []string{"ass", "casino"}
+	// Use a minimum length of 1 so that short inputs such as "ass" reach the
+	// term matcher. With a larger minimum they would be rejected by the
+	// length check and the boundary cases would pass for the wrong reason.
+	sf := NewSpamFilter(customList, 1)
+
+	tests := []struct {
+		name    string
+		content string
+		wantErr bool
+	}{
+		{"exact match", "ass", true},
+		{"with space before", " this ass", true},
+		{"with space after", "ass ", true},
+		{"in middle", "this ass here", true},
+		{"with punctuation", "ass.", true},
+		{"substring should not match", "this is classic", false},     // "ass" in "classic"
+		{"substring should not match 2", "cassandra is cool", false}, // "ass" in "cassandra"
+		{"casino exact", "casino", true},
+		{"casino plural", "casinos", false}, // different word
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := sf.Check(tt.content)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("Check(%q) error = %v, wantErr %v", tt.content, err, tt.wantErr)
+			}
+		})
+	}
+}
+
+// TestNormalize pins the output of normalize for lowercasing and for the
+// character substitutions it applies.
+func TestNormalize(t *testing.T) {
+	filter := NewSpamFilter(nil, 5)
+
+	cases := []struct {
+		raw  string
+		want string
+	}{
+		{raw: "ViAgRA", want: "viagra"},
+		{raw: "V1@GR@", want: "viagra"}, // 1→i, @→a
+		{raw: "C451N0", want: "casino"}, // 4→a, 5→s, 1→i, 0→o
+		{raw: "Test!", want: "testi"},   // !→i
+		{raw: "Mixed CASE", want: "mixed case"},
+		{raw: "0wned", want: "owned"}, // 0→o
+	}
+
+	for _, c := range cases {
+		t.Run(c.raw, func(t *testing.T) {
+			if got := filter.normalize(c.raw); got != c.want {
+				t.Errorf("normalize(%q) = %q, want %q", c.raw, got, c.want)
+			}
+		})
+	}
+}