feat(api): add spam/word filter for feedback submission
Per plan §13.6, implement a configurable spam filter for the handleCreateFeedback endpoint that:
- Validates minimum content length (default 10 chars, configurable via the ACB_SPAM_MIN_LENGTH env var)
- Normalizes case and maps common character substitutions back to letters (leetspeak: 0→o, 1→i, 3→e, 4→a, 5→s, 7→t, @→a, $→s, etc.)
- Checks content against a block-list of banned terms with word-boundary matching
- Returns HTTP 422 (Unprocessable Entity) on filter rejection

Configuration:
- ACB_SPAM_BLOCK_LIST: comma-separated custom blocked terms (optional; defaults to an embedded list of common spam/offensive words)
- ACB_SPAM_MIN_LENGTH: minimum feedback content length (default: 10)

The embedded default block-list includes:
- Profanity and offensive language
- Common spam patterns (buy now, click here, free money, etc.)
- Scam patterns (bitcoin giveaway, urgent, act now, etc.)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
39fe612f6a
commit
6dd69f596d
4 changed files with 359 additions and 5 deletions
|
|
@ -12,6 +12,7 @@ import (
|
|||
"net/http"
|
||||
"os"
|
||||
"os/signal"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
|
|
@ -36,6 +37,8 @@ type Config struct {
|
|||
BotTimeoutSecs int
|
||||
StaleJobMinutes int
|
||||
MaxConsecFails int
|
||||
SpamBlockList string // Comma-separated list of blocked terms (env: ACB_SPAM_BLOCK_LIST)
|
||||
SpamMinLength int // Minimum feedback content length (env: ACB_SPAM_MIN_LENGTH)
|
||||
}
|
||||
|
||||
func loadConfig() Config {
|
||||
|
|
@ -54,6 +57,8 @@ func loadConfig() Config {
|
|||
BotTimeoutSecs: envInt("ACB_BOT_TIMEOUT", 5),
|
||||
StaleJobMinutes: envInt("ACB_STALE_JOB_MINUTES", 15),
|
||||
MaxConsecFails: envInt("ACB_MAX_CONSEC_FAILS", 3),
|
||||
SpamBlockList: os.Getenv("ACB_SPAM_BLOCK_LIST"),
|
||||
SpamMinLength: envInt("ACB_SPAM_MIN_LENGTH", 10),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -80,13 +85,25 @@ func main() {
|
|||
cfg: cfg,
|
||||
db: db,
|
||||
rdb: rdb,
|
||||
regLimiter: ratelimit.NewLimiter(5, 5.0/3600), // 5/hour per IP
|
||||
feedbackLtr: ratelimit.NewLimiter(20, 20.0/3600), // 20/hour per IP
|
||||
predictLtr: ratelimit.NewLimiter(60, 60.0/3600), // 60/hour per IP
|
||||
submitLtr: ratelimit.NewLimiter(5, 5.0/86400), // 5/day per key
|
||||
voteLtr: ratelimit.NewLimiter(10, 10.0/3600), // 10/hour per IP
|
||||
regLimiter: ratelimit.NewLimiter(5, 5.0/3600), // 5/hour per IP
|
||||
feedbackLtr: ratelimit.NewLimiter(20, 20.0/3600), // 20/hour per IP
|
||||
predictLtr: ratelimit.NewLimiter(60, 60.0/3600), // 60/hour per IP
|
||||
submitLtr: ratelimit.NewLimiter(5, 5.0/86400), // 5/day per key
|
||||
voteLtr: ratelimit.NewLimiter(10, 10.0/3600), // 10/hour per IP
|
||||
}
|
||||
|
||||
// Initialize spam filter with configurable block-list
|
||||
var blockList []string
|
||||
if cfg.SpamBlockList != "" {
|
||||
blockList = strings.Split(cfg.SpamBlockList, ",")
|
||||
for i := range blockList {
|
||||
blockList[i] = strings.TrimSpace(blockList[i])
|
||||
}
|
||||
}
|
||||
srv.spamFilter = NewSpamFilter(blockList, cfg.SpamMinLength)
|
||||
log.Printf("[SPAMFILTER] initialized with %d blocked terms, min length %d",
|
||||
srv.spamFilter.BlockedCount(), cfg.SpamMinLength)
|
||||
|
||||
// Periodically purge stale rate-limit buckets (every 10 min)
|
||||
go func() {
|
||||
ticker := time.NewTicker(10 * time.Minute)
|
||||
|
|
|
|||
|
|
@ -33,6 +33,7 @@ type Server struct {
|
|||
predictLtr *ratelimit.Limiter // 60/hour per IP
|
||||
submitLtr *ratelimit.Limiter // 5/day per bot_id
|
||||
voteLtr *ratelimit.Limiter // 10/hour per IP
|
||||
spamFilter *SpamFilter // word/spam filter for feedback
|
||||
}
|
||||
|
||||
func (s *Server) RegisterRoutes(mux *http.ServeMux) {
|
||||
|
|
@ -1495,6 +1496,7 @@ func (s *Server) handlePredictionHistory(w http.ResponseWriter, r *http.Request)
|
|||
// handleCreateFeedback handles POST /api/feedback
|
||||
// Accepts community replay feedback per plan §13.6.
|
||||
// Stores in replay_feedback table with type enum: insight, mistake, idea, highlight.
|
||||
// Applies spam filtering: minimum length check and blocked term detection.
|
||||
func (s *Server) handleCreateFeedback(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
writeError(w, http.StatusMethodNotAllowed, "method not allowed")
|
||||
|
|
@ -1528,6 +1530,13 @@ func (s *Server) handleCreateFeedback(w http.ResponseWriter, r *http.Request) {
|
|||
req.Author = "Anonymous"
|
||||
}
|
||||
|
||||
// Apply spam filter: check minimum length and blocked terms
|
||||
if err := s.spamFilter.Check(req.Body); err != nil {
|
||||
log.Printf("[FEEDBACK] spam filter rejected: match=%s turn=%d type=%s: %v", req.MatchID, req.Turn, req.Type, err)
|
||||
writeError(w, http.StatusUnprocessableEntity, err.Error())
|
||||
return
|
||||
}
|
||||
|
||||
feedbackID, err := generateID("fb_", 6)
|
||||
if err != nil {
|
||||
log.Printf("failed to generate feedback ID: %v", err)
|
||||
|
|
|
|||
155
cmd/acb-api/spamfilter.go
Normal file
155
cmd/acb-api/spamfilter.go
Normal file
|
|
@ -0,0 +1,155 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// SpamFilter screens feedback text before it is accepted. Content is
// lowercased and has common character substitutions undone (see normalize)
// before it is compared against the blocked terms.
type SpamFilter struct {
	// blockedTerms is the set of banned terms, each stored in normalized form.
	blockedTerms map[string]struct{}
	// minLength is the shortest content length that Check accepts.
	minLength int
}
|
||||
|
||||
// defaultBlockList is the embedded set of spam/offensive terms that
// NewSpamFilter falls back to when it is not given a custom list. Entries may
// be single words or multi-word phrases.
var defaultBlockList = []string{
	// Profanity and offensive language.
	"fuck",
	"shit",
	"ass",
	"bitch",
	"damn",
	"crap",

	// Common spam phrases and products.
	"buy now",
	"click here",
	"free money",
	"winner",
	"congratulations",
	"viagra",
	"cialis",
	"porn",
	"xxx",
	"casino",
	"lottery",

	// Scam patterns.
	"send bitcoin",
	"crypto giveaway",
	"urgent",
	"act now",

	// Engagement bait (written lowercase, as matching happens on
	// normalized text).
	"clickbait",
	"subscribe",
	"like and subscribe",
}
|
||||
|
||||
// unicodeReplacements maps look-alike characters (leetspeak digits, ASCII
// symbols and a few non-ASCII signs) to the letter they commonly stand in
// for. normalize applies it rune by rune.
var unicodeReplacements = map[rune]rune{
	// Digits used as letters.
	'0': 'o', '1': 'i', '3': 'e',
	'4': 'a', '5': 's', '7': 't',

	// ASCII symbols used as letters.
	'@': 'a', '$': 's', '+': 't',
	'|': 'i', '!': 'i',

	// Non-ASCII signs used as letters.
	'©': 'c', '®': 'r',
}
|
||||
|
||||
// NewSpamFilter creates a spam filter with the given block-list and minimum length.
|
||||
// If blockList is nil, uses the embedded default list.
|
||||
// If minLength is 0, defaults to 10 characters.
|
||||
func NewSpamFilter(blockList []string, minLength int) *SpamFilter {
|
||||
if minLength == 0 {
|
||||
minLength = 10
|
||||
}
|
||||
|
||||
sf := &SpamFilter{
|
||||
blockedTerms: make(map[string]struct{}),
|
||||
minLength: minLength,
|
||||
}
|
||||
|
||||
// Use default list if none provided
|
||||
terms := blockList
|
||||
if len(terms) == 0 {
|
||||
terms = defaultBlockList
|
||||
}
|
||||
|
||||
// Normalize and store blocked terms
|
||||
for _, term := range terms {
|
||||
normalized := sf.normalize(term)
|
||||
if normalized != "" {
|
||||
sf.blockedTerms[normalized] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
return sf
|
||||
}
|
||||
|
||||
// normalize converts text to lowercase and strips common unicode substitutions.
|
||||
func (sf *SpamFilter) normalize(s string) string {
|
||||
var result strings.Builder
|
||||
result.Grow(len(s))
|
||||
|
||||
for _, r := range s {
|
||||
// Skip non-printable characters
|
||||
if !unicode.IsPrint(r) && !unicode.IsSpace(r) {
|
||||
continue
|
||||
}
|
||||
|
||||
// Apply unicode replacements
|
||||
if replacement, ok := unicodeReplacements[r]; ok {
|
||||
result.WriteRune(replacement)
|
||||
} else {
|
||||
// Convert to lowercase
|
||||
result.WriteRune(unicode.ToLower(r))
|
||||
}
|
||||
}
|
||||
|
||||
return result.String()
|
||||
}
|
||||
|
||||
// Check validates content against the spam filter.
|
||||
// Returns an error if content is too short or contains blocked terms.
|
||||
func (sf *SpamFilter) Check(content string) error {
|
||||
// Check minimum length
|
||||
if len(content) < sf.minLength {
|
||||
return fmt.Errorf("content must be at least %d characters", sf.minLength)
|
||||
}
|
||||
|
||||
// Skip empty check after length validation
|
||||
if content == "" {
|
||||
return fmt.Errorf("content cannot be empty")
|
||||
}
|
||||
|
||||
normalized := sf.normalize(content)
|
||||
|
||||
// Check for blocked terms (word-boundary aware)
|
||||
for blocked := range sf.blockedTerms {
|
||||
if sf.containsWord(normalized, blocked) {
|
||||
return fmt.Errorf("content contains blocked term")
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// containsWord checks if text contains the given word as a whole word (not substring).
|
||||
// It handles word boundaries using non-alphanumeric characters.
|
||||
func (sf *SpamFilter) containsWord(text, word string) bool {
|
||||
wordLen := len(word)
|
||||
textLen := len(text)
|
||||
|
||||
for i := 0; i <= textLen-wordLen; i++ {
|
||||
// Check if substring matches
|
||||
if text[i:i+wordLen] == word {
|
||||
// Check word boundary before
|
||||
beforeOK := i == 0 || !isAlphanumeric(text[i-1])
|
||||
// Check word boundary after
|
||||
afterOK := (i+wordLen) >= textLen || !isAlphanumeric(text[i+wordLen])
|
||||
|
||||
if beforeOK && afterOK {
|
||||
return true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
// isAlphanumeric reports whether b is an ASCII lowercase letter ('a'-'z') or
// an ASCII digit ('0'-'9'). Uppercase letters are not matched.
func isAlphanumeric(b byte) bool {
	switch {
	case 'a' <= b && b <= 'z':
		return true
	case '0' <= b && b <= '9':
		return true
	default:
		return false
	}
}
|
||||
|
||||
// BlockedCount returns the number of blocked terms in the filter.
|
||||
func (sf *SpamFilter) BlockedCount() int {
|
||||
return len(sf.blockedTerms)
|
||||
}
|
||||
173
cmd/acb-api/spamfilter_test.go
Normal file
173
cmd/acb-api/spamfilter_test.go
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestSpamFilter_MinLength(t *testing.T) {
|
||||
sf := NewSpamFilter(nil, 10)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
content string
|
||||
wantErr bool
|
||||
}{
|
||||
{"empty", "", true},
|
||||
{"too short", "hi", true},
|
||||
{"exactly min", "1234567890", false},
|
||||
{"above min", "this is valid feedback", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
err := sf.Check(tt.content)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("Check() error = %v, wantErr %v", err, tt.wantErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpamFilter_BlockedTerms(t *testing.T) {
|
||||
customList := []string{"spam", "scam", "viagra"}
|
||||
sf := NewSpamFilter(customList, 5)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
content string
|
||||
wantErr bool
|
||||
}{
|
||||
{"clean content", "this is good feedback", false},
|
||||
{"exact blocked", "spam here", true},
|
||||
{"blocked at start", "scam alert", true},
|
||||
{"blocked at end", "buy viagra", true},
|
||||
{"blocked in middle", "this is a scam attempt", true},
|
||||
{"case insensitive", "SPAM everywhere", true},
|
||||
{"mixed case", "VIAGRA pills", true},
|
||||
{"substring not blocked", "spamming is okay", false}, // "spamming" != "spam"
|
||||
{"partial word not blocked", "this is spammy", false}, // "spammy" != "spam"
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
err := sf.Check(tt.content)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("Check() error = %v, wantErr %v", err, tt.wantErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpamFilter_UnicodeNormalization(t *testing.T) {
|
||||
customList := []string{"viagra", "casino"}
|
||||
sf := NewSpamFilter(customList, 5)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
content string
|
||||
wantErr bool
|
||||
}{
|
||||
{"leetspeak 0", "v1agra pills", true},
|
||||
{"leetspeak 1", "v1@gra pills", true},
|
||||
{"leetspeak 3", "v1agr@ is bad", true},
|
||||
{"leetspeak 4", "c@s1n0 royale", true},
|
||||
{"leetspeak 5", "c451n0 royale", true},
|
||||
{"leetspeak 7", "c@5in0 royale", true},
|
||||
{"mixed leetspeak", "v1@gr@ and c@s1n0", true},
|
||||
{"clean content", "this is okay", false},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
err := sf.Check(tt.content)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("Check() error = %v, wantErr %v", err, tt.wantErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpamFilter_DefaultBlockList(t *testing.T) {
|
||||
sf := NewSpamFilter(nil, 5)
|
||||
|
||||
// Check that default block-list has terms
|
||||
if sf.BlockedCount() == 0 {
|
||||
t.Error("default block list is empty")
|
||||
}
|
||||
|
||||
// Test some terms from the default list
|
||||
tests := []struct {
|
||||
content string
|
||||
wantErr bool
|
||||
}{
|
||||
{"this is good feedback", false},
|
||||
{"buy now click here", true},
|
||||
{"free money winner", true},
|
||||
{"send bitcoin scam", true},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.content, func(t *testing.T) {
|
||||
err := sf.Check(tt.content)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("Check(%q) error = %v, wantErr %v", tt.content, err, tt.wantErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestSpamFilter_WordBoundaries(t *testing.T) {
|
||||
// Test that word boundaries are respected
|
||||
customList := []string{"ass", "casino"}
|
||||
sf := NewSpamFilter(customList, 5)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
content string
|
||||
wantErr bool
|
||||
}{
|
||||
{"exact match", "ass", true},
|
||||
{"with space before", " this ass", true},
|
||||
{"with space after", "ass ", true},
|
||||
{"in middle", "this ass here", true},
|
||||
{"with punctuation", "ass.", true},
|
||||
{"substring should not match", "this is classic", false}, // "ass" in "classic"
|
||||
{"substring should not match 2", "cassandra is cool", false}, // "ass" in "cassandra"
|
||||
{"casino exact", "casino", true},
|
||||
{"casino plural", "casinos", false}, // different word
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
err := sf.Check(tt.content)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("Check(%q) error = %v, wantErr %v", tt.content, err, tt.wantErr)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestNormalize(t *testing.T) {
|
||||
sf := NewSpamFilter(nil, 5)
|
||||
|
||||
tests := []struct {
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{"ViAgRA", "viagra"},
|
||||
{"V1@GR@", "viagra"}, // 1→i, @→a
|
||||
{"C451N0", "casino"}, // 4→a, 5→s, 0→o, 1→i
|
||||
{"Test!", "testi"}, // !→i
|
||||
{"Mixed CASE", "mixed case"},
|
||||
{"0wned", "owned"}, // 0→o
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.input, func(t *testing.T) {
|
||||
got := sf.normalize(tt.input)
|
||||
if got != tt.expected {
|
||||
t.Errorf("normalize(%q) = %q, want %q", tt.input, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue