Sync from /srv/compose/unified-media-manager

This commit is contained in:
Christopher Mayor
2026-04-24 10:45:19 -07:00
commit 7dbd00e537
132 changed files with 25394 additions and 0 deletions

View File

@@ -0,0 +1,287 @@
package cardigann
import (
"fmt"
yaml "gopkg.in/yaml.v3"
)
// Definition represents a parsed Cardigann YAML indexer definition.
// It matches the upstream Cardigann schema for site definitions.
// Each field is populated from the YAML key named in its struct tag.
type Definition struct {
// Site is required; ParseDefinition rejects a definition where it is empty.
Site string `yaml:"site"`
// Name is required; ParseDefinition rejects a definition where it is empty.
Name string `yaml:"name"`
// Description is free-form text; nothing in the visible code interprets it.
Description string `yaml:"description"`
// Language is set to "en-us" by ParseDefinition when left empty.
Language string `yaml:"language"`
// Encoding is set to "UTF-8" by ParseDefinition when left empty.
Encoding string `yaml:"encoding"`
// Links accepts a single string or a list of strings. At least one entry is
// required, and the first entry serves as the base URL unless the runtime
// config supplies a "base_url" override.
Links StringOrSlice `yaml:"links"`
// Settings lists the user-configurable fields declared by the definition.
Settings []SettingsField `yaml:"settings"`
// Caps holds the category and search-mode mappings.
Caps CapabilitiesBlock `yaml:"caps"`
// Login describes how to authenticate; login is attempted when it has a
// path or at least one input.
Login LoginBlock `yaml:"login"`
// Ratio describes ratio display configuration.
Ratio RatioBlock `yaml:"ratio"`
// Search describes how to issue a search and extract result rows.
Search SearchBlock `yaml:"search"`
}
// SettingsField describes a user-configurable field in the definition.
type SettingsField struct {
// Name identifies the setting (YAML key "name").
// NOTE(review): presumably the key looked up in the runtime config map — confirm.
Name string `yaml:"name"`
// Type is the declared kind of the setting (YAML key "type"); the visible
// code does not interpret it.
Type string `yaml:"type"`
// Label is the human-readable caption (YAML key "label").
Label string `yaml:"label"`
}
// CapabilitiesBlock maps categories and search modes.
type CapabilitiesBlock struct {
// Categories is a string-to-string category mapping. ParseDefinition
// rejects definitions with more than 1000 entries.
// NOTE(review): presumably site category ID -> category name — confirm.
Categories map[string]string `yaml:"categories"`
// Modes maps a search mode name to a list of strings.
// NOTE(review): presumably the parameters each mode supports — confirm.
Modes map[string][]string `yaml:"modes"`
}
// LoginBlock describes authentication configuration.
// When the "login" key is present in YAML, its custom UnmarshalYAML fills
// Method and Form with "form" if they are empty.
type LoginBlock struct {
// Path is the login page or endpoint; it is rendered as a template and
// resolved against the base URL.
Path string `yaml:"path"`
// Method selects the login flow: "cookie", "post", or anything else for
// the HTML form flow.
Method string `yaml:"method"`
// Form is the CSS selector used to locate the login form in the form flow.
Form string `yaml:"form"`
// Inputs maps form field names to template strings rendered with the
// runtime config.
Inputs map[string]string `yaml:"inputs"`
// Error lists patterns for detecting a failed login.
Error []ErrorBlock `yaml:"error"`
// Test describes a page fetched after login to confirm it worked.
Test PageTestBlock `yaml:"test"`
}
// ErrorBlock describes an error detection pattern.
// NOTE(review): the visible checkLoginErrors only tests whether any
// ErrorBlock is configured; Path, Selector and Message are not consulted there.
type ErrorBlock struct {
// Path is read from the YAML key "path".
Path string `yaml:"path"`
// Selector is read from the YAML key "selector".
// NOTE(review): presumably a CSS selector whose presence signals an error — confirm.
Selector string `yaml:"selector"`
// Message describes how to extract the error text.
Message SelectorBlock `yaml:"message"`
}
// PageTestBlock describes a page test for verifying login.
// The test runs when either field is non-empty.
type PageTestBlock struct {
// Path is the page to fetch after login; the login code falls back to "/"
// when it is empty.
Path string `yaml:"path"`
// Selector is a CSS selector expected to match on the test page.
Selector string `yaml:"selector"`
}
// SearchBlock describes search configuration.
type SearchBlock struct {
// Path is a template for the search URL path; the engine uses "/" when empty.
Path string `yaml:"path"`
// Method is the HTTP method; the engine sends a POST only for "post"
// (case-insensitive) and a GET otherwise.
Method string `yaml:"method"`
// Inputs maps request parameter names to template strings. The special
// key "$raw" is rendered and parsed as a query string.
Inputs map[string]string `yaml:"inputs"`
// Rows describes how result rows are located in the response HTML.
Rows RowsBlock `yaml:"rows"`
// Fields lists the per-row field extractors in YAML declaration order.
// ParseDefinition rejects definitions with more than 100 entries.
Fields FieldsListBlock `yaml:"fields"`
}
// RowsBlock describes how to find result rows in HTML.
type RowsBlock struct {
// Selector is the CSS selector matching one element per result row.
Selector string `yaml:"selector"`
// Remove is a CSS selector for descendants deleted from each row before
// fields are extracted.
Remove string `yaml:"remove"`
// After is read from the YAML key "after".
// NOTE(review): not consulted by the visible search code — confirm intended semantics.
After int `yaml:"after"`
// DateHeaders is read from the YAML key "dateheaders".
// NOTE(review): not consulted by the visible search code.
DateHeaders SelectorBlock `yaml:"dateheaders"`
}
// FieldBlock represents a single field extraction definition.
// Instances are built by FieldsListBlock.UnmarshalYAML from one key/value
// pair of the "fields" mapping.
type FieldBlock struct {
// Field is the field name, taken from the mapping key (e.g. "title").
Field string `yaml:"field"`
// Block is the extraction rule decoded from the mapping value. It is
// excluded from direct YAML (un)marshalling by the "-" tag.
Block SelectorBlock `yaml:"-"`
}
// SelectorBlock describes CSS selector extraction with optional filters.
// NOTE(review): the code that interprets these fields (ExtractField) is not
// visible here; the field notes below are inferred from names — confirm.
type SelectorBlock struct {
// Selector is read from the YAML key "selector".
Selector string `yaml:"selector"`
// Text is read from the YAML key "text"; presumably a literal/template
// value used instead of a selector.
Text string `yaml:"text"`
// Attribute is read from the YAML key "attribute"; presumably the element
// attribute to read instead of its text.
Attribute string `yaml:"attribute"`
// Remove is read from the YAML key "remove".
Remove string `yaml:"remove"`
// Filters is the ordered list of transformations for the extracted value.
Filters []FilterBlock `yaml:"filters"`
// Case is read from the YAML key "case".
Case map[string]string `yaml:"case"`
}
// FilterBlock represents a filter transformation.
type FilterBlock struct {
// Name selects the filter implementation (e.g. "regexp", "split", "trim").
Name string `yaml:"name"`
// Args holds the filter's arguments as decoded from YAML. The filter
// implementations accept a string or, for some filters, a list.
Args interface{} `yaml:"args"`
}
// RatioBlock describes ratio display configuration.
// NOTE(review): nothing in the visible code reads these fields.
type RatioBlock struct {
// Selector is read from the YAML key "selector".
Selector string `yaml:"selector"`
// Path is read from the YAML key "path".
Path string `yaml:"path"`
}
// StringOrSlice holds one or more strings and can be written in YAML either
// as a single scalar or as a sequence of scalars.
type StringOrSlice []string

// UnmarshalYAML first tries to decode the node as a single string and, if
// that fails, as a list of strings. A lone string becomes a one-element slice.
func (s *StringOrSlice) UnmarshalYAML(value *yaml.Node) error {
	var scalar string
	if value.Decode(&scalar) == nil {
		*s = []string{scalar}
		return nil
	}

	var list []string
	err := value.Decode(&list)
	if err != nil {
		return fmt.Errorf("expected string or list of strings: %w", err)
	}
	*s = list
	return nil
}
// FieldsListBlock preserves the field ordering from YAML map keys.
type FieldsListBlock []FieldBlock

// UnmarshalYAML decodes the Cardigann "fields" mapping, where each key is a
// field name and each value is a selector block, into an ordered slice.
//
// The yaml.Node is walked directly because decoding into a Go map would lose
// the declaration order. It returns an error when the node is not a mapping
// or when a value cannot be decoded into a SelectorBlock.
func (f *FieldsListBlock) UnmarshalYAML(value *yaml.Node) error {
	if value.Kind != yaml.MappingNode {
		return fmt.Errorf("fields must be a mapping")
	}

	// A mapping node stores its entries as a flat key, value, key, value list.
	result := make([]FieldBlock, 0, len(value.Content)/2)
	for i := 0; i+1 < len(value.Content); i += 2 {
		fieldName := value.Content[i].Value

		// Decode the value node in place instead of marshalling it back to
		// bytes and re-parsing it.
		var block SelectorBlock
		if err := value.Content[i+1].Decode(&block); err != nil {
			return fmt.Errorf("failed to unmarshal field %q block: %w", fieldName, err)
		}
		result = append(result, FieldBlock{
			Field: fieldName,
			Block: block,
		})
	}
	*f = result
	return nil
}
// UnmarshalYAML decodes a "rows" mapping into r.
// No defaults are applied at present: every field is copied through exactly
// as decoded, and r is left untouched when decoding fails.
func (r *RowsBlock) UnmarshalYAML(value *yaml.Node) error {
	// plainRows has RowsBlock's fields and tags but none of its methods, so
	// decoding into it does not re-enter this method.
	type plainRows RowsBlock

	var decoded plainRows
	err := value.Decode(&decoded)
	if err != nil {
		return err
	}
	*r = RowsBlock(decoded)
	return nil
}
// UnmarshalYAML decodes a "login" mapping into l and fills in defaults:
// an empty Method becomes "form" and an empty Form selector becomes "form".
// l is left untouched when decoding fails.
func (l *LoginBlock) UnmarshalYAML(value *yaml.Node) error {
	// plainLogin has LoginBlock's fields and tags but none of its methods, so
	// decoding into it does not re-enter this method.
	type plainLogin LoginBlock

	decoded := plainLogin{}
	if err := value.Decode(&decoded); err != nil {
		return err
	}

	if decoded.Method == "" {
		decoded.Method = "form"
	}
	if decoded.Form == "" {
		decoded.Form = "form"
	}

	*l = LoginBlock(decoded)
	return nil
}
// ParseDefinition decodes raw YAML into a Definition.
//
// After decoding it defaults Language to "en-us" and Encoding to "UTF-8",
// then rejects definitions that lack site, name or links, or that exceed the
// size limits (more than 100 search fields or 1000 category mappings).
func ParseDefinition(data []byte) (*Definition, error) {
	def := new(Definition)
	if err := yaml.Unmarshal(data, def); err != nil {
		return nil, fmt.Errorf("parse YAML: %w", err)
	}

	// Defaults for optional metadata.
	if def.Language == "" {
		def.Language = "en-us"
	}
	if def.Encoding == "" {
		def.Encoding = "UTF-8"
	}

	// Required fields, checked in a fixed order so the first missing one is
	// the one reported.
	switch {
	case def.Site == "":
		return nil, fmt.Errorf("definition missing required field: site")
	case def.Name == "":
		return nil, fmt.Errorf("definition missing required field: name")
	case len(def.Links) == 0:
		return nil, fmt.Errorf("definition missing required field: links")
	}

	// Threat model T-10-04: reject oversized definitions.
	if n := len(def.Search.Fields); n > 100 {
		return nil, fmt.Errorf("definition has too many search fields (%d > 100)", n)
	}
	if n := len(def.Caps.Categories); n > 1000 {
		return nil, fmt.Errorf("definition has too many category mappings (%d > 1000)", n)
	}

	return def, nil
}
// ValidateDefinition returns a list of validation warnings for a parsed definition.
// These are not errors — the definition may still be usable — but indicate potential issues.
//
// It currently warns when the row selector is empty and when the search
// fields lack a "title" or a "download" entry. The result is nil when there
// is nothing to report.
//
// NOTE(review): an earlier revision built a set of setting names here,
// apparently to check that login inputs reference declared settings, but
// never compared anything against it. That dead code has been removed; the
// check itself remains unimplemented.
func ValidateDefinition(def *Definition) []string {
	var warnings []string

	if def.Search.Rows.Selector == "" {
		warnings = append(warnings, "search.rows.selector is empty — search will not find results")
	}

	var hasTitle, hasDownload bool
	for i := range def.Search.Fields {
		switch def.Search.Fields[i].Field {
		case "title":
			hasTitle = true
		case "download":
			hasDownload = true
		}
	}
	if !hasTitle {
		warnings = append(warnings, "search.fields missing \"title\" field — results will have no title")
	}
	if !hasDownload {
		warnings = append(warnings, "search.fields missing \"download\" field — results will have no download URL")
	}

	return warnings
}

View File

@@ -0,0 +1,614 @@
package cardigann
import (
"context"
"fmt"
"io"
"log/slog"
"net/http"
"net/url"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
"github.com/dustin/go-humanize"
)
// CardigannResult is the output of a Cardigann search operation.
// It is converted to service.SearchResult by the service layer.
type CardigannResult struct {
// Title comes from the "title" field; rows without a title are dropped.
Title string
// GUID comes from the "details" field, resolved against the base URL.
GUID string
// DownloadURL comes from the "download" field, resolved against the base URL.
DownloadURL string
// Size is the "size" field parsed as a human-readable byte count; it stays
// 0 when the field is missing or unparseable.
Size int64
// PubDate comes from the "date" field, normalised to RFC 3339 when it can
// be parsed and left as extracted otherwise.
PubDate string
// Seeders is the "seeders" field parsed as an integer (0 if unparseable).
Seeders int
// Peers is the "leechers" field parsed as an integer (0 if unparseable).
Peers int
// Category comes from the "category" field.
Category string
// Description comes from the "description" field.
Description string
}
// IndexerTestResult is the result of testing a Cardigann indexer connection.
type IndexerTestResult struct {
// Success reports whether every test step passed.
Success bool
// Error describes the first failing step; it is empty on success.
Error string
}
// CardigannEngine handles Cardigann indexer operations: search, login, test.
// NOTE(review): login mutates the cookies field without locking, so sharing
// one engine across goroutines looks unsafe — confirm how callers use it.
type CardigannEngine struct {
// httpClient performs all outbound requests.
httpClient *http.Client
// cookies are collected during login and attached to later requests.
cookies []*http.Cookie
// logger receives warnings such as per-field extraction failures.
logger *slog.Logger
}
// NewCardigannEngine returns an engine that sends requests through
// SafeHTTPClient and logs via the process-wide default slog logger.
func NewCardigannEngine() *CardigannEngine {
	engine := new(CardigannEngine)
	engine.httpClient = SafeHTTPClient()
	engine.logger = slog.Default()
	return engine
}
// Search executes a Cardigann search: login (if needed), build request, parse HTML, extract results.
//
// The search path and every search input are rendered as templates with the
// query and config. Inputs are sent as a form body for POST and as a query
// string otherwise. The special "$raw" input is parsed as a query string and
// merged after the named inputs, so it overrides them and may carry repeated
// parameters. The response body is capped at 10MB (T-10-07) and rows without
// a title are dropped.
//
// It returns an error when login fails, a template cannot be rendered, the
// URL is rejected by ValidateURL, the request fails, the server answers with
// status >= 400, or the HTML cannot be parsed.
func (e *CardigannEngine) Search(ctx context.Context, def *Definition, config map[string]string, query SearchQuery) ([]CardigannResult, error) {
	baseURL := e.getBaseURL(def, config)

	// Login if required
	if def.Login.Path != "" || len(def.Login.Inputs) > 0 {
		if err := e.login(ctx, def, config, baseURL); err != nil {
			return nil, fmt.Errorf("login failed: %w", err)
		}
	}

	tplCtx := TemplateContext{
		Query:      query,
		Config:     config,
		Categories: []string{},
	}

	// Build search URL from path template
	searchPath := def.Search.Path
	if searchPath == "" {
		searchPath = "/"
	}
	path, err := ApplyTemplate("search-path", searchPath, tplCtx)
	if err != nil {
		return nil, fmt.Errorf("template search path: %w", err)
	}
	searchURL, err := e.resolvePath(baseURL, path)
	if err != nil {
		return nil, fmt.Errorf("resolve search URL: %w", err)
	}

	// Validate the search URL (SSRF protection)
	if err := ValidateURL(searchURL); err != nil {
		return nil, fmt.Errorf("search URL blocked: %w", err)
	}

	// Build query inputs. "$raw" is held back and merged last so the outcome
	// does not depend on map iteration order.
	inputValues := make(url.Values, len(def.Search.Inputs))
	rawQuery, hasRaw := "", false
	for key, tplStr := range def.Search.Inputs {
		rendered, err := ApplyTemplate("input-"+key, tplStr, tplCtx)
		if err != nil {
			return nil, fmt.Errorf("template input %q: %w", key, err)
		}
		if key == "$raw" {
			rawQuery, hasRaw = rendered, true
			continue
		}
		inputValues.Set(key, rendered)
	}
	if hasRaw {
		parsed, err := url.ParseQuery(rawQuery)
		if err != nil {
			// Best effort: a malformed $raw input is skipped, not fatal.
			e.logger.Warn("ignoring malformed $raw search input", "error", err)
		} else {
			for k, vals := range parsed {
				// Keep every value so repeated parameters survive.
				inputValues[k] = append([]string(nil), vals...)
			}
		}
	}

	// Anything other than POST is sent as GET.
	method := http.MethodGet
	if strings.ToUpper(def.Search.Method) == "POST" {
		method = http.MethodPost
	}

	searchCtx, searchCancel := context.WithTimeout(ctx, 15*time.Second)
	defer searchCancel()

	var req *http.Request
	if method == http.MethodPost {
		req, err = http.NewRequestWithContext(searchCtx, http.MethodPost, searchURL, strings.NewReader(inputValues.Encode()))
		if err != nil {
			return nil, fmt.Errorf("create POST request: %w", err)
		}
		req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	} else {
		// GET: append query string
		if len(inputValues) > 0 {
			sep := "?"
			if strings.Contains(searchURL, "?") {
				sep = "&"
			}
			searchURL += sep + inputValues.Encode()
		}
		req, err = http.NewRequestWithContext(searchCtx, http.MethodGet, searchURL, nil)
		if err != nil {
			return nil, fmt.Errorf("create GET request: %w", err)
		}
	}
	for _, cookie := range e.cookies {
		req.AddCookie(cookie)
	}

	resp, err := e.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("%s search: %w", method, err)
	}
	defer resp.Body.Close()

	if resp.StatusCode >= 400 {
		return nil, fmt.Errorf("search returned HTTP %d", resp.StatusCode)
	}

	// Read response with size limit (T-10-07: 10MB cap)
	doc, err := goquery.NewDocumentFromReader(io.LimitReader(resp.Body, 10*1024*1024))
	if err != nil {
		return nil, fmt.Errorf("parse HTML: %w", err)
	}

	// Find rows
	rows := doc.Find(def.Search.Rows.Selector)
	if def.Search.Rows.Remove != "" {
		rows.Find(def.Search.Rows.Remove).Remove()
	}

	var results []CardigannResult
	rows.Each(func(_ int, row *goquery.Selection) {
		if result, ok := e.resultFromRow(def, baseURL, row); ok {
			results = append(results, result)
		}
	})
	return results, nil
}

// resultFromRow extracts one search result from a result row. The boolean is
// false when the row has no title and should be skipped.
func (e *CardigannEngine) resultFromRow(def *Definition, baseURL string, row *goquery.Selection) (CardigannResult, bool) {
	fieldValues := make(map[string]string, len(def.Search.Fields))
	for i := range def.Search.Fields {
		field := &def.Search.Fields[i]
		val, err := ExtractField(row, field.Block)
		if err != nil {
			e.logger.Warn("field extraction error", "field", field.Field, "error", err)
			continue
		}
		fieldValues[field.Field] = val
	}

	result := CardigannResult{
		Title:       fieldValues["title"],
		DownloadURL: fieldValues["download"],
		GUID:        fieldValues["details"],
		Category:    fieldValues["category"],
		Description: fieldValues["description"],
		PubDate:     fieldValues["date"],
	}
	// Only include results with at least a title
	if result.Title == "" {
		return result, false
	}

	// Resolve relative URLs; on failure the extracted value is kept as-is.
	for _, target := range []*string{&result.DownloadURL, &result.GUID} {
		if *target == "" {
			continue
		}
		if resolved, err := e.resolvePath(baseURL, *target); err == nil {
			*target = resolved
		}
	}

	// Numeric fields stay zero when missing or unparseable.
	if sizeStr := fieldValues["size"]; sizeStr != "" {
		if size, err := humanize.ParseBytes(strings.TrimSpace(sizeStr)); err == nil {
			result.Size = int64(size)
		}
	}
	if seedersStr := fieldValues["seeders"]; seedersStr != "" {
		if v, err := strconv.Atoi(strings.TrimSpace(seedersStr)); err == nil {
			result.Seeders = v
		}
	}
	if leechersStr := fieldValues["leechers"]; leechersStr != "" {
		if v, err := strconv.Atoi(strings.TrimSpace(leechersStr)); err == nil {
			result.Peers = v
		}
	}

	// Normalise the date to RFC3339 when it can be parsed.
	if result.PubDate != "" {
		result.PubDate = e.parseDateField(result.PubDate)
	}
	return result, true
}
// login performs authentication against the Cardigann indexer and stores the
// resulting cookies on the engine for later requests.
//
// Cookies from any previous login are discarded first, so repeated logins do
// not pile up duplicates. Three flows exist, chosen by def.Login.Method:
//
//   - "cookie": the rendered "cookie" input is split into name=value pairs
//     (separated by ';') and used directly. No login path is needed.
//   - "post": the rendered inputs are POSTed to the login path.
//   - anything else ("form"): the login page is fetched, the form matching
//     def.Login.Form is located, its hidden inputs are merged with the
//     rendered inputs, and the result is POSTed to the form's action. The
//     action is resolved relative to the login page URL.
//
// When a login test is configured, the test page is then fetched with the
// collected cookies. Login fails if that page answers with status >= 400 or
// if the test selector does not match its HTML.
func (e *CardigannEngine) login(ctx context.Context, def *Definition, config map[string]string, baseURL string) error {
	// Upper bound on how much of any login-related response body is read.
	const maxBody = 10 * 1024 * 1024

	tplCtx := TemplateContext{Config: config}

	// Cookie logins make no request to the login path, so it is optional.
	var loginURL string
	if def.Login.Method != "cookie" {
		if def.Login.Path == "" {
			return fmt.Errorf("login path is empty")
		}
		path, err := ApplyTemplate("login-path", def.Login.Path, tplCtx)
		if err != nil {
			return fmt.Errorf("template login path: %w", err)
		}
		loginURL, err = e.resolvePath(baseURL, path)
		if err != nil {
			return fmt.Errorf("resolve login URL: %w", err)
		}
		if err := ValidateURL(loginURL); err != nil {
			return fmt.Errorf("login URL blocked: %w", err)
		}
	}

	// Build input values from login.inputs
	inputValues := make(map[string]string, len(def.Login.Inputs))
	for key, tplStr := range def.Login.Inputs {
		rendered, err := ApplyTemplate("login-input-"+key, tplStr, tplCtx)
		if err != nil {
			return fmt.Errorf("template login input %q: %w", key, err)
		}
		inputValues[key] = rendered
	}

	// One deadline covers every request made during login.
	loginCtx, loginCancel := context.WithTimeout(ctx, 10*time.Second)
	defer loginCancel()

	// Start from a clean slate so cookies do not accumulate across logins.
	e.cookies = nil

	switch def.Login.Method {
	case "cookie":
		e.cookies = splitCookiePairs(inputValues["cookie"])

	case "post":
		// POST directly to login path with inputs
		form := url.Values{}
		for key, val := range inputValues {
			form.Set(key, val)
		}
		req, err := http.NewRequestWithContext(loginCtx, http.MethodPost, loginURL, strings.NewReader(form.Encode()))
		if err != nil {
			return fmt.Errorf("create login POST: %w", err)
		}
		req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
		resp, err := e.httpClient.Do(req)
		if err != nil {
			return fmt.Errorf("login POST: %w", err)
		}
		defer resp.Body.Close()
		// Drain so the connection can be reused; the content is not inspected.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxBody))
		e.cookies = resp.Cookies()
		if err := e.checkLoginErrors(resp, def); err != nil {
			return err
		}

	default:
		// "form" method (default): GET login page, find form, fill inputs, submit.
		req, err := http.NewRequestWithContext(loginCtx, http.MethodGet, loginURL, nil)
		if err != nil {
			return fmt.Errorf("create login GET: %w", err)
		}
		resp, err := e.httpClient.Do(req)
		if err != nil {
			return fmt.Errorf("login GET: %w", err)
		}
		defer resp.Body.Close()
		e.cookies = append(e.cookies, resp.Cookies()...)

		doc, err := goquery.NewDocumentFromReader(io.LimitReader(resp.Body, maxBody))
		if err != nil {
			return fmt.Errorf("parse login page: %w", err)
		}

		formSelector := def.Login.Form
		if formSelector == "" {
			formSelector = "form"
		}
		form := doc.Find(formSelector).First()
		if form.Length() == 0 {
			return fmt.Errorf("login form not found with selector %q", formSelector)
		}

		// A missing or empty action means "submit to the page itself". Otherwise
		// the action is relative to the login page, as a browser treats it.
		actionURL := loginURL
		if action, ok := form.Attr("action"); ok && action != "" {
			actionURL, err = e.resolvePath(loginURL, action)
			if err != nil {
				return fmt.Errorf("resolve form action: %w", err)
			}
			if err := ValidateURL(actionURL); err != nil {
				return fmt.Errorf("form action URL blocked: %w", err)
			}
		}

		// Hidden inputs (e.g. CSRF tokens) first, then the configured inputs,
		// which take precedence on name clashes.
		formValues := url.Values{}
		form.Find("input[type='hidden']").Each(func(_ int, s *goquery.Selection) {
			name, _ := s.Attr("name")
			value, _ := s.Attr("value")
			if name != "" {
				formValues.Set(name, value)
			}
		})
		for key, val := range inputValues {
			formValues.Set(key, val)
		}

		submitReq, err := http.NewRequestWithContext(loginCtx, http.MethodPost, actionURL, strings.NewReader(formValues.Encode()))
		if err != nil {
			return fmt.Errorf("create form submit: %w", err)
		}
		submitReq.Header.Set("Content-Type", "application/x-www-form-urlencoded")
		for _, cookie := range e.cookies {
			submitReq.AddCookie(cookie)
		}
		submitResp, err := e.httpClient.Do(submitReq)
		if err != nil {
			return fmt.Errorf("submit login form: %w", err)
		}
		defer submitResp.Body.Close()
		_, _ = io.Copy(io.Discard, io.LimitReader(submitResp.Body, maxBody))
		e.cookies = append(e.cookies, submitResp.Cookies()...)
		if err := e.checkLoginErrors(submitResp, def); err != nil {
			return err
		}
	}

	// Test login if test block is defined
	if def.Login.Test.Selector == "" && def.Login.Test.Path == "" {
		return nil
	}
	testPath := def.Login.Test.Path
	if testPath == "" {
		testPath = "/"
	}
	testURL, err := e.resolvePath(baseURL, testPath)
	if err != nil {
		return fmt.Errorf("resolve test URL: %w", err)
	}
	if err := ValidateURL(testURL); err != nil {
		return fmt.Errorf("test URL blocked: %w", err)
	}
	testReq, err := http.NewRequestWithContext(loginCtx, http.MethodGet, testURL, nil)
	if err != nil {
		return fmt.Errorf("create test request: %w", err)
	}
	for _, cookie := range e.cookies {
		testReq.AddCookie(cookie)
	}
	testResp, err := e.httpClient.Do(testReq)
	if err != nil {
		return fmt.Errorf("login test request: %w", err)
	}
	defer testResp.Body.Close()

	if testResp.StatusCode >= 400 {
		return fmt.Errorf("login test returned HTTP %d", testResp.StatusCode)
	}
	if def.Login.Test.Selector == "" {
		_, _ = io.Copy(io.Discard, io.LimitReader(testResp.Body, maxBody))
		return nil
	}

	// Parse the real response body; the selector must match for login to
	// count as successful.
	testDoc, err := goquery.NewDocumentFromReader(io.LimitReader(testResp.Body, maxBody))
	if err != nil {
		// Don't fail on parse errors
		e.logger.Warn("login test page could not be parsed", "error", err)
		return nil
	}
	if testDoc.Find(def.Login.Test.Selector).Length() == 0 {
		return fmt.Errorf("login test: selector %q not found", def.Login.Test.Selector)
	}
	return nil
}

// splitCookiePairs turns a Cookie-header style string ("a=1; b=2") into
// cookies. Pairs with an empty name are skipped; a pair without '=' yields a
// cookie with an empty value.
func splitCookiePairs(header string) []*http.Cookie {
	var cookies []*http.Cookie
	for _, pair := range strings.Split(header, ";") {
		name, value, _ := strings.Cut(strings.TrimSpace(pair), "=")
		name = strings.TrimSpace(name)
		if name == "" {
			continue
		}
		cookies = append(cookies, &http.Cookie{Name: name, Value: strings.TrimSpace(value)})
	}
	return cookies
}
// Test checks that an indexer is usable: the base URL must be present and
// pass ValidateURL, login (when the definition has one) must succeed, and a
// GET of the search path (when set) must answer with a status below 400.
//
// Failures are reported through the returned IndexerTestResult; the error
// return value is always nil.
func (e *CardigannEngine) Test(ctx context.Context, def *Definition, config map[string]string) (*IndexerTestResult, error) {
	failure := func(msg string) (*IndexerTestResult, error) {
		return &IndexerTestResult{Success: false, Error: msg}, nil
	}

	baseURL := e.getBaseURL(def, config)
	if baseURL == "" {
		return failure("no base URL in definition")
	}
	if err := ValidateURL(baseURL); err != nil {
		return failure(fmt.Sprintf("URL blocked: %v", err))
	}

	// Attempt login when the definition declares one.
	needsLogin := def.Login.Path != "" || len(def.Login.Inputs) > 0
	if needsLogin {
		if err := e.login(ctx, def, config, baseURL); err != nil {
			return failure(fmt.Sprintf("login failed: %v", err))
		}
	}

	// Without a search path there is nothing further to probe.
	if def.Search.Path == "" {
		return &IndexerTestResult{Success: true}, nil
	}

	rendered, err := ApplyTemplate("test-path", def.Search.Path, TemplateContext{
		Config: config,
	})
	if err != nil {
		return failure(fmt.Sprintf("template error: %v", err))
	}
	probeURL, err := e.resolvePath(baseURL, rendered)
	if err != nil {
		return failure(fmt.Sprintf("resolve URL: %v", err))
	}
	if err := ValidateURL(probeURL); err != nil {
		return failure(fmt.Sprintf("URL blocked: %v", err))
	}

	probeCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	probe, err := http.NewRequestWithContext(probeCtx, http.MethodGet, probeURL, nil)
	if err != nil {
		return failure(err.Error())
	}
	for _, c := range e.cookies {
		probe.AddCookie(c)
	}
	resp, err := e.httpClient.Do(probe)
	if err != nil {
		return failure(err.Error())
	}
	resp.Body.Close()

	if resp.StatusCode >= 400 {
		return failure(fmt.Sprintf("HTTP %d", resp.StatusCode))
	}
	return &IndexerTestResult{Success: true}, nil
}
// resolvePath turns path into an absolute URL. An empty path yields baseURL,
// a path that already starts with http:// or https:// (any letter case) is
// returned unchanged, and anything else is resolved as a reference against
// baseURL. It fails when either string cannot be parsed as a URL.
func (e *CardigannEngine) resolvePath(baseURL, path string) (string, error) {
	if path == "" {
		return baseURL, nil
	}

	lowered := strings.ToLower(path)
	for _, scheme := range [...]string{"http://", "https://"} {
		if strings.HasPrefix(lowered, scheme) {
			return path, nil
		}
	}

	base, err := url.Parse(baseURL)
	if err != nil {
		return "", fmt.Errorf("parse base URL: %w", err)
	}
	ref, err := url.Parse(path)
	if err != nil {
		return "", fmt.Errorf("parse path: %w", err)
	}
	return base.ResolveReference(ref).String(), nil
}
// getBaseURL picks the base URL for an indexer: a non-empty "base_url" entry
// in config wins, otherwise the definition's first link is used. It returns
// "" when neither is available.
func (e *CardigannEngine) getBaseURL(def *Definition, config map[string]string) string {
	if override := config["base_url"]; override != "" {
		return override
	}
	if len(def.Links) == 0 {
		return ""
	}
	return def.Links[0]
}
// dateFieldLayouts lists the absolute date layouts parseDateField tries, in
// order, after the RFC3339 check.
var dateFieldLayouts = []string{
	"2006-01-02 15:04:05",
	"2006-01-02T15:04:05",
	"2006-01-02",
	"02-Jan-2006",
	"Jan 02, 2006",
	"Jan 02 2006",
	"02 Jan 2006 15:04:05",
	time.RFC1123,
	time.RFC1123Z,
	time.RFC822,
	time.RFC822Z,
}

// parseDateField attempts to parse a date string in various formats and
// returns it as RFC3339.
//
// Surrounding whitespace is ignored for every attempt. A value that is
// already RFC3339 is returned trimmed but otherwise unchanged. Failing that,
// the known absolute layouts are tried, then relative expressions such as
// "2 hours ago". A value that matches nothing is returned exactly as given.
func (e *CardigannEngine) parseDateField(val string) string {
	trimmed := strings.TrimSpace(val)

	// Already RFC3339
	if _, err := time.Parse(time.RFC3339, trimmed); err == nil {
		return trimmed
	}

	for _, layout := range dateFieldLayouts {
		if t, err := time.Parse(layout, trimmed); err == nil {
			return t.Format(time.RFC3339)
		}
	}

	// Try relative time
	if t, err := parseFuzzyTime(trimmed); err == nil {
		return t.Format(time.RFC3339)
	}

	// Return as-is if we can't parse
	return val
}
// checkLoginErrors reports a failed login based on the response status.
//
// It only acts when the definition declares at least one login error
// pattern, and even then it looks at the status code alone: callers have
// already consumed the body, so the patterns themselves are not evaluated.
func (e *CardigannEngine) checkLoginErrors(resp *http.Response, def *Definition) error {
	noPatterns := len(def.Login.Error) == 0
	if noPatterns || resp.StatusCode < 400 {
		return nil
	}
	return fmt.Errorf("login returned HTTP %d", resp.StatusCode)
}

View File

@@ -0,0 +1,296 @@
package cardigann
import (
"fmt"
"regexp"
"strconv"
"strings"
"time"
)
// ApplyFilters runs val through each filter in order, feeding every filter
// the previous one's output. It stops at the first failing filter and returns
// that filter's output together with its error.
func ApplyFilters(val string, filters []FilterBlock) (string, error) {
	current := val
	for i := range filters {
		next, err := invokeFilter(current, filters[i])
		if err != nil {
			return next, err
		}
		current = next
	}
	return current, nil
}
// invokeFilter looks up the implementation for f.Name and applies it to val
// with f.Args. An unknown name yields val unchanged plus an error.
func invokeFilter(val string, f FilterBlock) (string, error) {
	var apply func(string, interface{}) (string, error)

	switch f.Name {
	case "querystring":
		apply = filterQuerystring
	case "dateparse", "timeparse":
		apply = filterDateParse
	case "regexp":
		apply = filterRegexp
	case "split":
		apply = filterSplit
	case "replace":
		apply = filterReplace
	case "trim":
		apply = filterTrim
	case "append":
		apply = filterAppend
	case "prepend":
		apply = filterPrepend
	case "timeago", "fuzzytime", "reltime":
		apply = filterTimeAgo
	}

	if apply == nil {
		return val, fmt.Errorf("unknown filter: %q", f.Name)
	}
	return apply(val, f.Args)
}
// filterQuerystring extracts a query parameter from a URL value.
// Args: param name string
//
// The query string is everything between the first '?' and an optional
// '#fragment'. The first key=value pair whose decoded key equals the
// parameter name wins, and its value is returned with '+' and %XX escapes
// decoded. It returns "" when val has no query string or the parameter is
// absent, and an error (with val unchanged) when args is not a string.
func filterQuerystring(val string, args interface{}) (string, error) {
	paramName, ok := args.(string)
	if !ok {
		return val, fmt.Errorf("querystring filter: args must be a string")
	}

	_, query, found := strings.Cut(val, "?")
	if !found {
		return "", nil
	}
	// A fragment is not part of the query string.
	if frag := strings.IndexByte(query, '#'); frag >= 0 {
		query = query[:frag]
	}

	for _, pair := range strings.Split(query, "&") {
		key, value, hasValue := strings.Cut(pair, "=")
		if !hasValue || unescapeQueryComponent(key) != paramName {
			continue
		}
		return unescapeQueryComponent(value), nil
	}
	return "", nil
}

// unescapeQueryComponent decodes '+' as a space and %XX as the byte it
// encodes. Malformed escapes are kept literally rather than rejected, since
// scraped links are frequently sloppy.
func unescapeQueryComponent(s string) string {
	if !strings.ContainsAny(s, "%+") {
		return s
	}
	var b strings.Builder
	b.Grow(len(s))
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c == '+':
			b.WriteByte(' ')
		case c == '%' && i+2 < len(s):
			if n, err := strconv.ParseUint(s[i+1:i+3], 16, 8); err == nil {
				b.WriteByte(byte(n))
				i += 2
				continue
			}
			b.WriteByte(c)
		default:
			b.WriteByte(c)
		}
	}
	return b.String()
}
// filterDateParse interprets val (after trimming whitespace) using the Go
// time layout given in args and returns it formatted as RFC3339.
// Args: layout string (e.g., "2006-01-02")
// On any failure val is returned unchanged together with an error.
func filterDateParse(val string, args interface{}) (string, error) {
	switch layout := args.(type) {
	case string:
		parsed, err := time.Parse(layout, strings.TrimSpace(val))
		if err != nil {
			return val, fmt.Errorf("dateparse: %w", err)
		}
		return parsed.Format(time.RFC3339), nil
	default:
		return val, fmt.Errorf("dateparse filter: args must be a string (Go time layout)")
	}
}
// filterRegexp returns the text captured by the pattern's first group.
// Args: pattern string
// val is returned unchanged when the pattern does not match or has no
// capture group. A non-string or uncompilable pattern yields val plus an error.
func filterRegexp(val string, args interface{}) (string, error) {
	pattern, isString := args.(string)
	if !isString {
		return val, fmt.Errorf("regexp filter: args must be a string (pattern)")
	}

	compiled, compileErr := regexp.Compile(pattern)
	if compileErr != nil {
		return val, fmt.Errorf("regexp compile: %w", compileErr)
	}

	if groups := compiled.FindStringSubmatch(val); len(groups) >= 2 {
		return groups[1], nil
	}
	return val, nil
}
// filterSplit cuts val at every separator and returns the piece at the
// requested position. Negative positions count from the end. When the
// position is out of range, val is returned unchanged.
// Args: [separator, position] as []interface{} or single string
func filterSplit(val string, args interface{}) (string, error) {
	sep, pos := parseSplitArgs(args)
	pieces := strings.Split(val, sep)

	if pos < 0 {
		pos += len(pieces)
	}
	if pos >= 0 && pos < len(pieces) {
		return pieces[pos], nil
	}
	return val, nil
}
// filterReplace substitutes every occurrence of one string with another.
// Args: [from, to] as []interface{} or single string
// It never fails; how args map to the two strings is decided by
// parseReplaceArgs.
func filterReplace(val string, args interface{}) (string, error) {
	search, replacement := parseReplaceArgs(args)
	replaced := strings.ReplaceAll(val, search, replacement)
	return replaced, nil
}
// filterTrim strips leading and trailing characters from val.
// Args: cutset string
// With a string argument, every character in that cutset is stripped;
// with any other argument, surrounding whitespace is stripped instead.
func filterTrim(val string, args interface{}) (string, error) {
	if cutset, isString := args.(string); isString {
		return strings.Trim(val, cutset), nil
	}
	return strings.TrimSpace(val), nil
}
// filterAppend returns val followed by the suffix given in args.
// Args: suffix string
// A non-string argument yields val unchanged plus an error.
func filterAppend(val string, args interface{}) (string, error) {
	switch suffix := args.(type) {
	case string:
		return val + suffix, nil
	default:
		return val, fmt.Errorf("append filter: args must be a string")
	}
}
// filterPrepend returns the prefix given in args followed by val.
// Args: prefix string
// A non-string argument yields val unchanged plus an error.
func filterPrepend(val string, args interface{}) (string, error) {
	switch prefix := args.(type) {
	case string:
		return prefix + val, nil
	default:
		return val, fmt.Errorf("prepend filter: args must be a string")
	}
}
// filterTimeAgo converts a relative time expression such as "2 hours ago",
// "yesterday" or "3d ago" into an RFC3339 timestamp. The filter arguments are
// ignored. When the expression is not understood, val is returned unchanged
// together with the parse error.
func filterTimeAgo(val string, _ interface{}) (string, error) {
	when, err := parseFuzzyTime(strings.TrimSpace(val))
	if err == nil {
		return when.Format(time.RFC3339), nil
	}
	return val, err
}
// fuzzyTimePattern matches "<number><optional space><unit>", where the unit
// is a full name, a prefix of one, or a common abbreviation: s/sec/second,
// m/min/minute, h/hr/hour, d/day, w/wk/week, mo/month, y/yr/year, each with
// an optional plural "s" where it applies.
var fuzzyTimePattern = regexp.MustCompile(`^(\d+)\s*(s(?:ec(?:ond)?s?)?|m(?:in(?:ute)?s?)?|h(?:ou?r?s?|rs?)?|d(?:ay?s?)?|w(?:ee?k?s?|ks?)?|mo(?:nth?s?)?|y(?:ea?r?s?|rs?)?)$`)

// parseFuzzyTime handles relative time strings.
// Supports: "N unit(s) ago", "yesterday", "today", "now", and abbreviations
// like "2h ago", "5m", "3d", "1w ago".
//
// Matching is case-insensitive and ignores surrounding whitespace. The
// result is the current time minus the stated amount; days, weeks, months
// and years are subtracted as calendar units. On failure it returns the
// current time and an error.
func parseFuzzyTime(val string) (time.Time, error) {
	now := time.Now()
	lower := strings.ToLower(strings.TrimSpace(val))

	switch lower {
	case "yesterday":
		return now.AddDate(0, 0, -1), nil
	case "today", "now":
		return now, nil
	}

	// Remove "ago" suffix, with or without a trailing period.
	lower = strings.TrimSuffix(lower, " ago")
	lower = strings.TrimSuffix(lower, " ago.")
	lower = strings.TrimSpace(lower)

	matches := fuzzyTimePattern.FindStringSubmatch(lower)
	if matches == nil {
		return now, fmt.Errorf("unrecognized relative time: %q", val)
	}

	n, err := strconv.Atoi(matches[1])
	if err != nil {
		return now, fmt.Errorf("invalid number in relative time: %q", matches[1])
	}

	// "mo" must be tested before "m" so months are not read as minutes.
	unit := matches[2]
	switch {
	case strings.HasPrefix(unit, "s"):
		return now.Add(-time.Duration(n) * time.Second), nil
	case strings.HasPrefix(unit, "mo"):
		return now.AddDate(0, -n, 0), nil
	case strings.HasPrefix(unit, "m"):
		return now.Add(-time.Duration(n) * time.Minute), nil
	case strings.HasPrefix(unit, "h"):
		return now.Add(-time.Duration(n) * time.Hour), nil
	case strings.HasPrefix(unit, "d"):
		return now.AddDate(0, 0, -n), nil
	case strings.HasPrefix(unit, "w"):
		return now.AddDate(0, 0, -n*7), nil
	case strings.HasPrefix(unit, "y"):
		return now.AddDate(-n, 0, 0), nil
	default:
		return now, fmt.Errorf("unrecognized time unit: %q", unit)
	}
}
// parseSplitArgs reads the separator and position for the split filter.
//
// A plain string is taken as the separator with position 0. A list supplies
// the separator as its first element (when that is a string) and the position
// as its second, which may be an int, a float64 or a numeric string.
// Anything missing or of another type falls back to separator "," and
// position 0.
func parseSplitArgs(args interface{}) (string, int) {
	const defaultSep = ","

	if sep, isString := args.(string); isString {
		return sep, 0
	}
	list, isList := args.([]interface{})
	if !isList {
		return defaultSep, 0
	}

	sep, pos := defaultSep, 0
	if len(list) >= 1 {
		if s, ok := list[0].(string); ok {
			sep = s
		}
	}
	if len(list) >= 2 {
		switch p := list[1].(type) {
		case int:
			pos = p
		case float64:
			pos = int(p)
		case string:
			// A non-numeric string leaves the position at 0.
			pos, _ = strconv.Atoi(p)
		}
	}
	return sep, pos
}
// parseReplaceArgs reads the search and replacement strings for the replace
// filter.
//
// A plain string is the search text with an empty replacement. A list
// supplies the search text first and the replacement second; elements that
// are missing or not strings are treated as "". Any other argument type
// yields two empty strings.
func parseReplaceArgs(args interface{}) (string, string) {
	if search, isString := args.(string); isString {
		return search, ""
	}
	list, isList := args.([]interface{})
	if !isList {
		return "", ""
	}

	// stringAt returns the i-th element when it exists and is a string.
	stringAt := func(i int) string {
		if i >= len(list) {
			return ""
		}
		s, _ := list[i].(string)
		return s
	}
	return stringAt(0), stringAt(1)
}

View File

@@ -0,0 +1,48 @@
package cardigann
import (
"bytes"
"fmt"
"strings"
"text/template"
)
// SearchQuery represents a search query to be templated into request URLs and inputs.
// Templates reach it through the Query field of TemplateContext.
type SearchQuery struct {
// Keywords holds the search terms.
Keywords string
// MediaType is passed through to templates; the visible code does not
// interpret it.
// NOTE(review): presumably something like "movie" or "tv" — confirm.
MediaType string
}
// TemplateContext provides the data available to Cardigann templates.
type TemplateContext struct {
// Query is the current search query; it is left zero for login templates.
Query SearchQuery
// Config holds the runtime configuration values for the indexer.
Config map[string]string
// Categories is a list of category strings; the visible search code
// always passes an empty list.
Categories []string
}
// ApplyTemplate renders the Go text/template source tpl against ctx and
// returns the output. The template is registered under name, which also
// appears in error messages.
//
// Only the functions from sandboxedFuncMap are added on top of text/template's
// built-ins, keeping file and network helpers out of reach of definitions.
// It returns an error when the template fails to parse or to execute.
func ApplyTemplate(name, tpl string, ctx interface{}) (string, error) {
	parsed, parseErr := template.New(name).Funcs(sandboxedFuncMap()).Parse(tpl)
	if parseErr != nil {
		return "", fmt.Errorf("parse template %q: %w", name, parseErr)
	}

	out := new(bytes.Buffer)
	if execErr := parsed.Execute(out, ctx); execErr != nil {
		return "", fmt.Errorf("execute template %q: %w", name, execErr)
	}
	return out.String(), nil
}
// sandboxedFuncMap builds the function map exposed to Cardigann templates.
// SECURITY: nothing granting file, network, environment or exec access may
// be added here.
// Threat model T-10-02, T-10-06: the map contains ONLY "replace", which
// takes (old, new, src) and returns src with every old replaced by new.
func sandboxedFuncMap() template.FuncMap {
	funcs := make(template.FuncMap, 1)
	funcs["replace"] = func(old, new, src string) string {
		return strings.ReplaceAll(src, old, new)
	}
	return funcs
}

View File

@@ -0,0 +1,165 @@
package cardigann
import (
	"context"
	"fmt"
	"net"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"
)
// ValidateURL validates that a URL is safe to make requests to.
// It blocks non-HTTP(S) schemes, well-known local hostnames, and hosts that
// resolve to private/reserved IP addresses.
// Threat model T-10-05: SSRF protection.
//
// The host is extracted with net/url rather than by string slicing. A
// hand-rolled split is easy to get wrong: stripping the port before the
// userinfo turns "http://example.com:80@127.0.0.1/" into "example.com", so
// the check passes while the real request goes to 127.0.0.1.
//
// Setting the environment variable CARDIGANN_ALLOW_PRIVATE to "true" disables
// every check (intended for tests only).
//
// Returns nil when the URL is acceptable. Otherwise it returns an error
// describing why it was rejected: bad scheme, unparsable URL, missing host,
// blocked hostname, resolution failure, or a private address.
//
// Note: this check resolves the name at validation time only. The name may
// resolve differently when the request is actually made; SafeHTTPClient
// repeats the address check at dial time.
func ValidateURL(rawURL string) error {
	// Check for config override (testing only).
	if os.Getenv("CARDIGANN_ALLOW_PRIVATE") == "true" {
		return nil
	}

	// Cheap scheme check before full URL parsing.
	lower := strings.ToLower(rawURL)
	if !strings.HasPrefix(lower, "http://") && !strings.HasPrefix(lower, "https://") {
		return fmt.Errorf("URL scheme must be http or https, got: %q", rawURL)
	}

	// Parse with net/url so userinfo, port, query, fragment, and bracketed
	// IPv6 literals are all handled correctly.
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return fmt.Errorf("invalid URL: %w", err)
	}

	// Hostname strips the port and the brackets around IPv6 literals.
	host := strings.ToLower(parsed.Hostname())
	if host == "" {
		return fmt.Errorf("URL %q has no host", rawURL)
	}

	// Block well-known local hostnames. A trailing root dot ("localhost.")
	// names the same host, so it is ignored for this comparison.
	name := strings.TrimSuffix(host, ".")
	if name == "localhost" || strings.HasSuffix(name, ".local") || strings.HasSuffix(name, ".internal") {
		return fmt.Errorf("hostname %q is blocked (private/local)", host)
	}

	// Resolve the hostname and reject it if any address is private.
	resolveCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()

	var resolver net.Resolver
	ips, err := resolver.LookupIPAddr(resolveCtx, host)
	if err != nil {
		return fmt.Errorf("failed to resolve hostname %q: %w", host, err)
	}
	for _, ipAddr := range ips {
		if isPrivateIP(ipAddr.IP) {
			return fmt.Errorf("hostname %q resolves to private IP %s", host, ipAddr.IP)
		}
	}
	return nil
}
// isPrivateIP reports whether ip is in a private/reserved range that outbound
// requests must not reach.
//
// Blocked IPv4 ranges: 127.0.0.0/8, 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16,
// 169.254.0.0/16, and 0.0.0.0.
// Blocked IPv6 ranges: ::1, fc00::/7, fe80::/10, and ::.
//
// A net.IP holding an IPv4 address may be 4 bytes long or 16 bytes long (the
// IPv4-in-IPv6 form, which is what net.ParseIP returns). The address is
// therefore normalised with To4/To16 before any byte is inspected. Indexing
// the raw slice would read the zero prefix of the 16-byte form and report
// addresses such as 10.0.0.1 or 169.254.169.254 as public.
//
// A slice that is neither a valid IPv4 nor IPv6 address (including nil) is
// reported as private so that callers fail closed.
func isPrivateIP(ip net.IP) bool {
	// IPv4, including IPv4-mapped IPv6 addresses (::ffff:a.b.c.d).
	if v4 := ip.To4(); v4 != nil {
		switch {
		case v4.IsLoopback(): // 127.0.0.0/8
			return true
		case v4.IsUnspecified(): // 0.0.0.0
			return true
		case v4[0] == 10: // 10.0.0.0/8
			return true
		case v4[0] == 172 && v4[1] >= 16 && v4[1] <= 31: // 172.16.0.0/12
			return true
		case v4[0] == 192 && v4[1] == 168: // 192.168.0.0/16
			return true
		case v4[0] == 169 && v4[1] == 254: // 169.254.0.0/16 (link-local)
			return true
		}
		return false
	}

	v6 := ip.To16()
	if v6 == nil {
		// Not a 4- or 16-byte address: refuse rather than index out of range.
		return true
	}

	switch {
	case v6.IsLoopback(): // ::1
		return true
	case v6.IsUnspecified(): // ::
		return true
	case v6[0]&0xfe == 0xfc: // fc00::/7 (unique local / private)
		return true
	case v6[0] == 0xfe && v6[1]&0xc0 == 0x80: // fe80::/10 (link-local)
		return true
	}
	return false
}
// SafeHTTPClient returns an http.Client with timeouts and DNS checking.
//
// The client has a 15-second overall timeout. Its transport uses a custom
// DialContext that resolves the target host itself, refuses the connection if
// ANY resolved address is private/reserved (per isPrivateIP), and then dials
// the vetted IP address directly rather than the hostname. Because the check
// happens on every new connection, it also covers redirect targets.
//
// Each resolved address is tried in order until one connects, so a single
// unreachable address (for example an IPv6 address on an IPv4-only network)
// does not fail the request. Each attempt has a 10-second dial timeout and is
// also bounded by the request context.
//
// Every call builds a new client with its own transport.
func SafeHTTPClient() *http.Client {
	dialer := &net.Dialer{Timeout: 10 * time.Second}

	safeDial := func(ctx context.Context, network, addr string) (net.Conn, error) {
		// Split once and keep the port for the outgoing dial.
		host, port, err := net.SplitHostPort(addr)
		if err != nil {
			return nil, fmt.Errorf("invalid dial address %q: %w", addr, err)
		}

		// Resolve here so the addresses that are checked are the ones dialled.
		ips, err := net.DefaultResolver.LookupIPAddr(ctx, host)
		if err != nil {
			return nil, fmt.Errorf("DNS resolution failed for %q: %w", host, err)
		}
		if len(ips) == 0 {
			return nil, fmt.Errorf("no IP addresses found for %q", host)
		}

		// Reject the host outright if any of its addresses is private.
		for _, ipAddr := range ips {
			if isPrivateIP(ipAddr.IP) {
				return nil, fmt.Errorf("blocked private IP %s for host %q", ipAddr.IP, host)
			}
		}

		// Try the vetted addresses in order and report the first failure if
		// none of them connects.
		var firstErr error
		for _, ipAddr := range ips {
			conn, dialErr := dialer.DialContext(ctx, network, net.JoinHostPort(ipAddr.IP.String(), port))
			if dialErr == nil {
				return conn, nil
			}
			if firstErr == nil {
				firstErr = dialErr
			}
			if ctx.Err() != nil {
				// The request was cancelled or timed out; further attempts are pointless.
				break
			}
		}
		return nil, firstErr
	}

	return &http.Client{
		Timeout: 15 * time.Second,
		Transport: &http.Transport{
			DialContext: safeDial,
		},
	}
}

View File

@@ -0,0 +1,84 @@
package cardigann
import (
"fmt"
"strings"
"github.com/PuerkitoBio/goquery"
)
// ExtractField resolves a selector block against a goquery selection and
// returns the resulting string after the block's filters have been applied.
//
// Resolution order:
//  1. A non-empty block.Text is used verbatim (filters still apply).
//  2. An empty block.Selector, or a selector that matches nothing, yields "".
//  3. If block.Remove is set, descendants matching it are removed from the
//     matched elements. This mutates the underlying document.
//  4. If block.Case has entries, the value of the first entry whose key is
//     contained in the trimmed text of any matched element is used; when no
//     entry matches the result is "".
//  5. Otherwise block.Attribute is read from the first matched element
//     ("" when the attribute is absent), or, with no attribute configured,
//     the trimmed text of the first matched element is used.
//
// NOTE(review): block.Case is ranged over as a map, so when more than one key
// matches, which entry wins depends on Go's unspecified map iteration order.
func ExtractField(selection *goquery.Selection, block SelectorBlock) (string, error) {
	// Static text short-circuits selector evaluation entirely.
	if block.Text != "" {
		return applyFiltersToValue(block.Text, block)
	}
	if block.Selector == "" {
		return "", nil
	}

	matched := selection.Find(block.Selector)
	if matched.Length() == 0 {
		return "", nil
	}

	// Strip unwanted child elements before any text is read.
	if block.Remove != "" {
		matched.Find(block.Remove).Remove()
	}

	if len(block.Case) > 0 {
		// anyContains reports whether some matched element's trimmed text
		// contains needle, stopping at the first hit.
		anyContains := func(needle string) bool {
			hit := false
			matched.EachWithBreak(func(_ int, el *goquery.Selection) bool {
				hit = strings.Contains(strings.TrimSpace(el.Text()), needle)
				return !hit
			})
			return hit
		}

		for needle, mapped := range block.Case {
			if anyContains(needle) {
				return applyFiltersToValue(mapped, block)
			}
		}
		return "", nil
	}

	// No attribute requested: use the first element's trimmed text.
	if block.Attribute == "" {
		return applyFiltersToValue(strings.TrimSpace(matched.First().Text()), block)
	}

	attr, present := matched.Attr(block.Attribute)
	if !present {
		return "", nil
	}
	return applyFiltersToValue(attr, block)
}
// applyFiltersToValue runs val through block.Filters via ApplyFilters.
//
// With no filters configured, val is returned unchanged. If the filter chain
// fails, the original unfiltered val is returned together with the wrapped
// error.
func applyFiltersToValue(val string, block SelectorBlock) (string, error) {
	chain := block.Filters
	if len(chain) == 0 {
		return val, nil
	}

	filtered, err := ApplyFilters(val, chain)
	if err == nil {
		return filtered, nil
	}
	return val, fmt.Errorf("filter chain error: %w", err)
}