feat(api): add spam/word filter for feedback submission

Per plan §13.6, implement a configurable spam filter for the
handleCreateFeedback endpoint that:

- Validates minimum content length (default 10 chars, configurable
  via ACB_SPAM_MIN_LENGTH env var)
- Normalizes case and maps common character substitutions back to plain
  letters (leetspeak: 0→o, 1→i, 3→e, 4→a, 5→s, 7→t, @→a, $→s, etc.)
- Checks content against a block-list of banned terms with word-boundary
  matching
- Returns HTTP 422 (Unprocessable Entity) on filter rejection

Configuration:
- ACB_SPAM_BLOCK_LIST: comma-separated custom blocked terms (optional,
  defaults to embedded list of common spam/offensive words)
- ACB_SPAM_MIN_LENGTH: minimum feedback content length (default: 10)

The embedded default block-list includes:
- Profanity and offensive language
- Common spam patterns (buy now, click here, free money, etc.)
- Scam patterns (send bitcoin, crypto giveaway, urgent, act now)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
jedarden 2026-05-04 00:55:47 -04:00
parent 39fe612f6a
commit 6dd69f596d
4 changed files with 359 additions and 5 deletions

View file

@ -12,6 +12,7 @@ import (
"net/http"
"os"
"os/signal"
"strings"
"syscall"
"time"
@ -36,6 +37,8 @@ type Config struct {
BotTimeoutSecs int
StaleJobMinutes int
MaxConsecFails int
SpamBlockList string // Comma-separated list of blocked terms (env: ACB_SPAM_BLOCK_LIST)
SpamMinLength int // Minimum feedback content length (env: ACB_SPAM_MIN_LENGTH)
}
func loadConfig() Config {
@ -54,6 +57,8 @@ func loadConfig() Config {
BotTimeoutSecs: envInt("ACB_BOT_TIMEOUT", 5),
StaleJobMinutes: envInt("ACB_STALE_JOB_MINUTES", 15),
MaxConsecFails: envInt("ACB_MAX_CONSEC_FAILS", 3),
SpamBlockList: os.Getenv("ACB_SPAM_BLOCK_LIST"),
SpamMinLength: envInt("ACB_SPAM_MIN_LENGTH", 10),
}
}
@ -80,13 +85,25 @@ func main() {
cfg: cfg,
db: db,
rdb: rdb,
regLimiter: ratelimit.NewLimiter(5, 5.0/3600), // 5/hour per IP
feedbackLtr: ratelimit.NewLimiter(20, 20.0/3600), // 20/hour per IP
predictLtr: ratelimit.NewLimiter(60, 60.0/3600), // 60/hour per IP
submitLtr: ratelimit.NewLimiter(5, 5.0/86400), // 5/day per key
voteLtr: ratelimit.NewLimiter(10, 10.0/3600), // 10/hour per IP
regLimiter: ratelimit.NewLimiter(5, 5.0/3600), // 5/hour per IP
feedbackLtr: ratelimit.NewLimiter(20, 20.0/3600), // 20/hour per IP
predictLtr: ratelimit.NewLimiter(60, 60.0/3600), // 60/hour per IP
submitLtr: ratelimit.NewLimiter(5, 5.0/86400), // 5/day per key
voteLtr: ratelimit.NewLimiter(10, 10.0/3600), // 10/hour per IP
}
// Initialize spam filter with configurable block-list
var blockList []string
if cfg.SpamBlockList != "" {
blockList = strings.Split(cfg.SpamBlockList, ",")
for i := range blockList {
blockList[i] = strings.TrimSpace(blockList[i])
}
}
srv.spamFilter = NewSpamFilter(blockList, cfg.SpamMinLength)
log.Printf("[SPAMFILTER] initialized with %d blocked terms, min length %d",
srv.spamFilter.BlockedCount(), cfg.SpamMinLength)
// Periodically purge stale rate-limit buckets (every 10 min)
go func() {
ticker := time.NewTicker(10 * time.Minute)

View file

@ -33,6 +33,7 @@ type Server struct {
predictLtr *ratelimit.Limiter // 60/hour per IP
submitLtr *ratelimit.Limiter // 5/day per bot_id
voteLtr *ratelimit.Limiter // 10/hour per IP
spamFilter *SpamFilter // word/spam filter for feedback
}
func (s *Server) RegisterRoutes(mux *http.ServeMux) {
@ -1495,6 +1496,7 @@ func (s *Server) handlePredictionHistory(w http.ResponseWriter, r *http.Request)
// handleCreateFeedback handles POST /api/feedback
// Accepts community replay feedback per plan §13.6.
// Stores in replay_feedback table with type enum: insight, mistake, idea, highlight.
// Applies spam filtering: minimum length check and blocked term detection.
func (s *Server) handleCreateFeedback(w http.ResponseWriter, r *http.Request) {
if r.Method != http.MethodPost {
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
@ -1528,6 +1530,13 @@ func (s *Server) handleCreateFeedback(w http.ResponseWriter, r *http.Request) {
req.Author = "Anonymous"
}
// Apply spam filter: check minimum length and blocked terms
if err := s.spamFilter.Check(req.Body); err != nil {
log.Printf("[FEEDBACK] spam filter rejected: match=%s turn=%d type=%s: %v", req.MatchID, req.Turn, req.Type, err)
writeError(w, http.StatusUnprocessableEntity, err.Error())
return
}
feedbackID, err := generateID("fb_", 6)
if err != nil {
log.Printf("failed to generate feedback ID: %v", err)

155
cmd/acb-api/spamfilter.go Normal file
View file

@ -0,0 +1,155 @@
package main
import (
"fmt"
"strings"
"unicode"
)
// SpamFilter screens feedback text before it is accepted. It enforces a
// minimum content length and rejects text that contains a blocked term.
// Blocked terms and submitted text are both passed through normalize, so
// matching ignores case and common character substitutions.
type SpamFilter struct {
	// blockedTerms is the set of blocked terms, keyed by normalized form.
	blockedTerms map[string]struct{}

	// minLength is the smallest content length that Check accepts.
	minLength int
}
// defaultBlockList is the embedded set of blocked terms used when the caller
// does not supply a list of its own. Entries are either single words or
// multi-word phrases.
var defaultBlockList = []string{
	// Profanity and offensive language.
	"fuck",
	"shit",
	"ass",
	"bitch",
	"damn",
	"crap",

	// Common spam patterns.
	"buy now",
	"click here",
	"free money",
	"winner",
	"congratulations",
	"viagra",
	"cialis",
	"porn",
	"xxx",
	"casino",
	"lottery",

	// Scam patterns.
	"send bitcoin",
	"crypto giveaway",
	"urgent",
	"act now",

	// Engagement bait.
	"clickbait",
	"subscribe",
	"like and subscribe",
}
// unicodeReplacements maps look-alike characters (leetspeak digits, ASCII
// symbols and a few Unicode symbols) to the lowercase letter they are
// commonly used to stand in for.
//
// Note that '!' is listed as a stand-in for 'i', so a '!' used as ordinary
// punctuation is mapped as well.
var unicodeReplacements = map[rune]rune{
	// Digits.
	'0': 'o',
	'1': 'i',
	'3': 'e',
	'4': 'a',
	'5': 's',
	'7': 't',

	// ASCII symbols.
	'@': 'a',
	'$': 's',
	'+': 't',
	'|': 'i',
	'!': 'i',

	// Non-ASCII symbols.
	'©': 'c',
	'®': 'r',
}
// NewSpamFilter creates a spam filter with the given block-list and minimum
// content length.
//
// When blockList is nil or empty, the embedded defaultBlockList is used.
// Every term is normalized and trimmed of surrounding whitespace before it is
// stored; terms that end up empty are dropped.
//
// A minLength of zero or less is not a usable minimum and is replaced by the
// default of 10 characters.
func NewSpamFilter(blockList []string, minLength int) *SpamFilter {
	if minLength <= 0 {
		minLength = 10
	}

	terms := blockList
	if len(terms) == 0 {
		terms = defaultBlockList
	}

	sf := &SpamFilter{
		blockedTerms: make(map[string]struct{}, len(terms)),
		minLength:    minLength,
	}
	for _, term := range terms {
		// A term that keeps leading or trailing whitespace only matches
		// content carrying the same whitespace, so " spam" would never match
		// text that starts with "spam". Trim it away.
		normalized := strings.TrimSpace(sf.normalize(term))
		if normalized == "" {
			continue
		}
		sf.blockedTerms[normalized] = struct{}{}
	}
	return sf
}
// normalize returns s in the canonical form used for block-list matching:
//
//   - runes that are neither printable nor whitespace are dropped;
//   - runes listed in unicodeReplacements are replaced by the letter they
//     stand in for;
//   - every other rune is lowercased.
//
// Digits are only treated as letter stand-ins when their run touches a letter
// or a substitution symbol ("c451n0", "@55"). A number standing on its own is
// left untouched, so ordinary text such as "turn 455" is not rewritten into
// "turn ass" and rejected.
func (sf *SpamFilter) normalize(s string) string {
	// Drop non-printable runes first so the neighbour checks below only see
	// runes that will actually be emitted.
	runes := make([]rune, 0, len(s))
	for _, r := range s {
		if unicode.IsPrint(r) || unicode.IsSpace(r) {
			runes = append(runes, r)
		}
	}

	isDigit := func(r rune) bool { return r >= '0' && r <= '9' }

	// joinsWord reports whether the rune at idx ties a neighbouring digit run
	// to a word. It is only called for positions just outside a maximal digit
	// run, so a hit in unicodeReplacements is always a non-digit symbol.
	joinsWord := func(idx int) bool {
		if idx < 0 || idx >= len(runes) {
			return false
		}
		if unicode.IsLetter(runes[idx]) {
			return true
		}
		_, ok := unicodeReplacements[runes[idx]]
		return ok
	}

	var result strings.Builder
	result.Grow(len(s))
	for i := 0; i < len(runes); {
		r := runes[i]
		if !isDigit(r) {
			if replacement, ok := unicodeReplacements[r]; ok {
				result.WriteRune(replacement)
			} else {
				result.WriteRune(unicode.ToLower(r))
			}
			i++
			continue
		}

		// Consume the maximal run of digits starting at i.
		end := i
		for end < len(runes) && isDigit(runes[end]) {
			end++
		}
		standalone := !joinsWord(i-1) && !joinsWord(end)
		for _, d := range runes[i:end] {
			replacement, ok := unicodeReplacements[d]
			if standalone || !ok {
				result.WriteRune(d)
			} else {
				result.WriteRune(replacement)
			}
		}
		i = end
	}
	return result.String()
}
// Check validates content against the spam filter.
//
// It returns an error when content has fewer than minLength characters
// (counted as Unicode code points, not bytes), when it is empty, or when its
// normalized form contains a blocked term as a whole word. It returns nil
// when the content is acceptable.
func (sf *SpamFilter) Check(content string) error {
	// Count code points so multi-byte text is measured in characters, as the
	// error message promises; len(content) would count bytes.
	if len([]rune(content)) < sf.minLength {
		return fmt.Errorf("content must be at least %d characters", sf.minLength)
	}

	// Only reachable when minLength is not positive; a positive minimum
	// already rejects the empty string above.
	if content == "" {
		return fmt.Errorf("content cannot be empty")
	}

	// normalize rewrites '!' to 'i'. That catches "v!agra", but it also glues
	// ordinary punctuation onto the preceding word ("buy now!" becomes
	// "buy nowi"), which defeats the word-boundary match. When the content
	// contains '!', also test a variant in which '!' acts as a separator.
	candidates := []string{sf.normalize(content)}
	if strings.Contains(content, "!") {
		candidates = append(candidates, sf.normalize(strings.ReplaceAll(content, "!", " ")))
	}

	for _, text := range candidates {
		for blocked := range sf.blockedTerms {
			if sf.containsWord(text, blocked) {
				return fmt.Errorf("content contains blocked term")
			}
		}
	}
	return nil
}
// containsWord reports whether word occurs in text as a whole word rather
// than as part of a longer one. An occurrence counts when the byte on each
// side of it is either missing (start or end of text) or not alphanumeric
// according to isAlphanumeric.
func (sf *SpamFilter) containsWord(text, word string) bool {
	// Slide a window of len(word) bytes across text.
	for start, end := 0, len(word); end <= len(text); start, end = start+1, end+1 {
		if text[start:end] != word {
			continue
		}
		// The occurrence is glued to a preceding word character.
		if start > 0 && isAlphanumeric(text[start-1]) {
			continue
		}
		// The occurrence runs straight into a following word character.
		if end < len(text) && isAlphanumeric(text[end]) {
			continue
		}
		return true
	}
	return false
}
// isAlphanumeric reports whether b is a lowercase ASCII letter or an ASCII
// digit. Uppercase letters and bytes outside ASCII yield false.
func isAlphanumeric(b byte) bool {
	switch {
	case 'a' <= b && b <= 'z':
		return true
	case '0' <= b && b <= '9':
		return true
	default:
		return false
	}
}
// BlockedCount reports how many distinct normalized terms the filter blocks.
func (sf *SpamFilter) BlockedCount() int {
	total := len(sf.blockedTerms)
	return total
}

View file

@ -0,0 +1,173 @@
package main
import (
"testing"
)
// TestSpamFilter_MinLength checks that Check rejects content shorter than the
// configured minimum of 10 and accepts content at or above it.
func TestSpamFilter_MinLength(t *testing.T) {
	filter := NewSpamFilter(nil, 10)

	type testCase struct {
		name    string
		content string
		wantErr bool
	}
	cases := []testCase{
		{name: "empty", content: "", wantErr: true},
		{name: "too short", content: "hi", wantErr: true},
		{name: "exactly min", content: "1234567890", wantErr: false},
		{name: "above min", content: "this is valid feedback", wantErr: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := filter.Check(tc.content)
			rejected := err != nil
			if rejected != tc.wantErr {
				t.Errorf("Check() error = %v, wantErr %v", err, tc.wantErr)
			}
		})
	}
}
// TestSpamFilter_BlockedTerms checks whole-word, case-insensitive matching
// against a custom block-list.
func TestSpamFilter_BlockedTerms(t *testing.T) {
	filter := NewSpamFilter([]string{"spam", "scam", "viagra"}, 5)

	type testCase struct {
		name    string
		content string
		wantErr bool
	}
	cases := []testCase{
		{name: "clean content", content: "this is good feedback", wantErr: false},
		{name: "exact blocked", content: "spam here", wantErr: true},
		{name: "blocked at start", content: "scam alert", wantErr: true},
		{name: "blocked at end", content: "buy viagra", wantErr: true},
		{name: "blocked in middle", content: "this is a scam attempt", wantErr: true},
		{name: "case insensitive", content: "SPAM everywhere", wantErr: true},
		{name: "mixed case", content: "VIAGRA pills", wantErr: true},
		// "spamming" is a different word from "spam".
		{name: "substring not blocked", content: "spamming is okay", wantErr: false},
		// "spammy" is a different word from "spam".
		{name: "partial word not blocked", content: "this is spammy", wantErr: false},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := filter.Check(tc.content)
			rejected := err != nil
			if rejected != tc.wantErr {
				t.Errorf("Check() error = %v, wantErr %v", err, tc.wantErr)
			}
		})
	}
}
// TestSpamFilter_UnicodeNormalization checks that blocked terms are still
// detected when letters are swapped for look-alike digits or symbols.
//
// Each case name lists the substitutions the content actually contains.
// "lottery" is on the block-list so that 3→e and 7→t are exercised; neither
// "viagra" nor "casino" contains an e or a t.
func TestSpamFilter_UnicodeNormalization(t *testing.T) {
	customList := []string{"viagra", "casino", "lottery"}
	sf := NewSpamFilter(customList, 5)

	tests := []struct {
		name    string
		content string
		wantErr bool
	}{
		{"1 for i", "v1agra pills", true},
		{"1 and @", "v1@gra pills", true},
		{"1 and trailing @", "v1agr@ is bad", true},
		{"@, 1 and 0", "c@s1n0 royale", true},
		{"4, 5, 1 and 0", "c451n0 royale", true},
		{"@, 5 and 0", "c@5in0 royale", true},
		{"0, 7 and 3", "l0773ry tickets", true},
		{"$ for s", "ca$ino royale", true},
		{"mixed leetspeak", "v1@gr@ and c@s1n0", true},
		{"clean content", "this is okay", false},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := sf.Check(tt.content)
			if (err != nil) != tt.wantErr {
				t.Errorf("Check(%q) error = %v, wantErr %v", tt.content, err, tt.wantErr)
			}
		})
	}
}
// TestSpamFilter_DefaultBlockList checks that a filter built without a custom
// list is populated and blocks a sample of the embedded terms.
func TestSpamFilter_DefaultBlockList(t *testing.T) {
	filter := NewSpamFilter(nil, 5)

	// The embedded list must contribute at least one term.
	if filter.BlockedCount() == 0 {
		t.Error("default block list is empty")
	}

	type testCase struct {
		content string
		wantErr bool
	}
	cases := []testCase{
		{content: "this is good feedback", wantErr: false},
		{content: "buy now click here", wantErr: true},
		{content: "free money winner", wantErr: true},
		{content: "send bitcoin scam", wantErr: true},
	}

	for _, tc := range cases {
		t.Run(tc.content, func(t *testing.T) {
			err := filter.Check(tc.content)
			rejected := err != nil
			if rejected != tc.wantErr {
				t.Errorf("Check(%q) error = %v, wantErr %v", tc.content, err, tc.wantErr)
			}
		})
	}
}
// TestSpamFilter_WordBoundaries checks that a blocked term only matches as a
// whole word and not as part of a longer word.
//
// The minimum length is set to 1 so that short inputs such as "ass" and
// "ass." reach the block-list check. With a larger minimum they would be
// rejected for being too short and the boundary logic would go untested.
func TestSpamFilter_WordBoundaries(t *testing.T) {
	customList := []string{"ass", "casino"}
	sf := NewSpamFilter(customList, 1)

	tests := []struct {
		name    string
		content string
		wantErr bool
	}{
		{"exact match", "ass", true},
		{"with space before", " this ass", true},
		{"with space after", "ass ", true},
		{"in middle", "this ass here", true},
		{"with punctuation", "ass.", true},
		{"substring should not match", "this is classic", false},     // "ass" in "classic"
		{"substring should not match 2", "cassandra is cool", false}, // "ass" in "cassandra"
		{"casino exact", "casino", true},
		{"casino plural", "casinos", false}, // different word
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			err := sf.Check(tt.content)
			if (err != nil) != tt.wantErr {
				t.Errorf("Check(%q) error = %v, wantErr %v", tt.content, err, tt.wantErr)
			}
		})
	}
}
// TestNormalize checks the lowercasing and character-substitution rules of
// normalize on individual words.
func TestNormalize(t *testing.T) {
	filter := NewSpamFilter(nil, 5)

	type testCase struct {
		input    string
		expected string
	}
	cases := []testCase{
		{input: "ViAgRA", expected: "viagra"},
		{input: "V1@GR@", expected: "viagra"}, // 1→i, @→a
		{input: "C451N0", expected: "casino"}, // 4→a, 5→s, 1→i, 0→o
		{input: "Test!", expected: "testi"},   // !→i
		{input: "Mixed CASE", expected: "mixed case"},
		{input: "0wned", expected: "owned"}, // 0→o
	}

	for _, tc := range cases {
		t.Run(tc.input, func(t *testing.T) {
			if got := filter.normalize(tc.input); got != tc.expected {
				t.Errorf("normalize(%q) = %q, want %q", tc.input, got, tc.expected)
			}
		})
	}
}