Sync from /srv/compose/unified-media-manager
This commit is contained in:
287
internal/cardigann/definition.go
Normal file
287
internal/cardigann/definition.go
Normal file
@@ -0,0 +1,287 @@
|
||||
package cardigann
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
yaml "gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// Definition represents a parsed Cardigann YAML indexer definition.
|
||||
// It matches the upstream Cardigann schema for site definitions.
|
||||
type Definition struct {
|
||||
Site string `yaml:"site"`
|
||||
Name string `yaml:"name"`
|
||||
Description string `yaml:"description"`
|
||||
Language string `yaml:"language"`
|
||||
Encoding string `yaml:"encoding"`
|
||||
Links StringOrSlice `yaml:"links"`
|
||||
Settings []SettingsField `yaml:"settings"`
|
||||
Caps CapabilitiesBlock `yaml:"caps"`
|
||||
Login LoginBlock `yaml:"login"`
|
||||
Ratio RatioBlock `yaml:"ratio"`
|
||||
Search SearchBlock `yaml:"search"`
|
||||
}
|
||||
|
||||
// SettingsField is one user-configurable field declared by a definition.
type SettingsField struct {
	Name  string `yaml:"name"`  // key under which the value is stored
	Type  string `yaml:"type"`  // input type as written in the definition
	Label string `yaml:"label"` // human-readable caption
}

// CapabilitiesBlock holds the "caps" section: category mappings and the
// search modes the indexer advertises.
type CapabilitiesBlock struct {
	Categories map[string]string   `yaml:"categories"`
	Modes      map[string][]string `yaml:"modes"`
}
|
||||
|
||||
// LoginBlock describes authentication configuration.
|
||||
type LoginBlock struct {
|
||||
Path string `yaml:"path"`
|
||||
Method string `yaml:"method"`
|
||||
Form string `yaml:"form"`
|
||||
Inputs map[string]string `yaml:"inputs"`
|
||||
Error []ErrorBlock `yaml:"error"`
|
||||
Test PageTestBlock `yaml:"test"`
|
||||
}
|
||||
|
||||
// ErrorBlock describes an error detection pattern.
|
||||
type ErrorBlock struct {
|
||||
Path string `yaml:"path"`
|
||||
Selector string `yaml:"selector"`
|
||||
Message SelectorBlock `yaml:"message"`
|
||||
}
|
||||
|
||||
// PageTestBlock describes a page test for verifying login.
|
||||
type PageTestBlock struct {
|
||||
Path string `yaml:"path"`
|
||||
Selector string `yaml:"selector"`
|
||||
}
|
||||
|
||||
// SearchBlock describes search configuration.
|
||||
type SearchBlock struct {
|
||||
Path string `yaml:"path"`
|
||||
Method string `yaml:"method"`
|
||||
Inputs map[string]string `yaml:"inputs"`
|
||||
Rows RowsBlock `yaml:"rows"`
|
||||
Fields FieldsListBlock `yaml:"fields"`
|
||||
}
|
||||
|
||||
// RowsBlock describes how to find result rows in HTML.
|
||||
type RowsBlock struct {
|
||||
Selector string `yaml:"selector"`
|
||||
Remove string `yaml:"remove"`
|
||||
After int `yaml:"after"`
|
||||
DateHeaders SelectorBlock `yaml:"dateheaders"`
|
||||
}
|
||||
|
||||
// FieldBlock pairs a field name with the selector block used to extract it.
type FieldBlock struct {
	Field string `yaml:"field"`
	// Block is excluded from direct YAML decoding ("-").
	Block SelectorBlock `yaml:"-"`
}

// SelectorBlock describes how to pull one value out of a DOM node: a CSS
// selector plus optional post-processing.
type SelectorBlock struct {
	Selector  string            `yaml:"selector"`
	Text      string            `yaml:"text"`
	Attribute string            `yaml:"attribute"`
	Remove    string            `yaml:"remove"`
	Filters   []FilterBlock     `yaml:"filters"`
	Case      map[string]string `yaml:"case"`
}

// FilterBlock is one named transformation together with its arguments.
type FilterBlock struct {
	Name string `yaml:"name"`
	// Args is left untyped because its YAML shape varies by filter.
	Args interface{} `yaml:"args"`
}

// RatioBlock is the "ratio" section: where to read the account ratio from.
type RatioBlock struct {
	Selector string `yaml:"selector"`
	Path     string `yaml:"path"`
}
|
||||
|
||||
// StringOrSlice is a custom type that accepts either a string or a slice of strings in YAML.
|
||||
type StringOrSlice []string
|
||||
|
||||
func (s *StringOrSlice) UnmarshalYAML(value *yaml.Node) error {
|
||||
var single string
|
||||
if err := value.Decode(&single); err == nil {
|
||||
*s = []string{single}
|
||||
return nil
|
||||
}
|
||||
var slice []string
|
||||
if err := value.Decode(&slice); err != nil {
|
||||
return fmt.Errorf("expected string or list of strings: %w", err)
|
||||
}
|
||||
*s = slice
|
||||
return nil
|
||||
}
|
||||
|
||||
// FieldsListBlock preserves the field ordering from YAML map keys.
|
||||
type FieldsListBlock []FieldBlock
|
||||
|
||||
func (f *FieldsListBlock) UnmarshalYAML(value *yaml.Node) error {
|
||||
// Cardigann fields are a YAML map where key is field name and value is selector block.
|
||||
// We use the yaml.Node directly to preserve key ordering.
|
||||
if value.Kind != yaml.MappingNode {
|
||||
return fmt.Errorf("fields must be a mapping")
|
||||
}
|
||||
|
||||
result := make([]FieldBlock, 0, len(value.Content)/2)
|
||||
for i := 0; i < len(value.Content); i += 2 {
|
||||
keyNode := value.Content[i]
|
||||
valNode := value.Content[i+1]
|
||||
|
||||
fieldName := keyNode.Value
|
||||
|
||||
// Marshal the value node back to YAML, then unmarshal into SelectorBlock
|
||||
valueBytes, err := yaml.Marshal(valNode)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal field %q: %w", fieldName, err)
|
||||
}
|
||||
|
||||
var block SelectorBlock
|
||||
if err := yaml.Unmarshal(valueBytes, &block); err != nil {
|
||||
return fmt.Errorf("failed to unmarshal field %q block: %w", fieldName, err)
|
||||
}
|
||||
|
||||
result = append(result, FieldBlock{
|
||||
Field: fieldName,
|
||||
Block: block,
|
||||
})
|
||||
}
|
||||
|
||||
*f = result
|
||||
return nil
|
||||
}
|
||||
|
||||
// UnmarshalYAML sets default values for RowsBlock.
|
||||
func (r *RowsBlock) UnmarshalYAML(value *yaml.Node) error {
|
||||
// Use a raw type to avoid infinite recursion
|
||||
type rawRows struct {
|
||||
Selector string `yaml:"selector"`
|
||||
Remove string `yaml:"remove"`
|
||||
After int `yaml:"after"`
|
||||
DateHeaders SelectorBlock `yaml:"dateheaders"`
|
||||
}
|
||||
var raw rawRows
|
||||
if err := value.Decode(&raw); err != nil {
|
||||
return err
|
||||
}
|
||||
r.Selector = raw.Selector
|
||||
r.Remove = raw.Remove
|
||||
r.After = raw.After
|
||||
r.DateHeaders = raw.DateHeaders
|
||||
return nil
|
||||
}
|
||||
|
||||
// UnmarshalYAML sets default values for LoginBlock.
|
||||
func (l *LoginBlock) UnmarshalYAML(value *yaml.Node) error {
|
||||
type rawLogin struct {
|
||||
Path string `yaml:"path"`
|
||||
Method string `yaml:"method"`
|
||||
Form string `yaml:"form"`
|
||||
Inputs map[string]string `yaml:"inputs"`
|
||||
Error []ErrorBlock `yaml:"error"`
|
||||
Test PageTestBlock `yaml:"test"`
|
||||
}
|
||||
var raw rawLogin
|
||||
if err := value.Decode(&raw); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
l.Path = raw.Path
|
||||
l.Method = raw.Method
|
||||
l.Form = raw.Form
|
||||
l.Inputs = raw.Inputs
|
||||
l.Error = raw.Error
|
||||
l.Test = raw.Test
|
||||
|
||||
// Apply defaults
|
||||
if l.Method == "" {
|
||||
l.Method = "form"
|
||||
}
|
||||
if l.Form == "" {
|
||||
l.Form = "form"
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ParseDefinition parses raw YAML bytes into a Definition struct.
|
||||
// It applies defaults and validates required fields.
|
||||
func ParseDefinition(data []byte) (*Definition, error) {
|
||||
var def Definition
|
||||
if err := yaml.Unmarshal(data, &def); err != nil {
|
||||
return nil, fmt.Errorf("parse YAML: %w", err)
|
||||
}
|
||||
|
||||
// Apply defaults
|
||||
if def.Language == "" {
|
||||
def.Language = "en-us"
|
||||
}
|
||||
if def.Encoding == "" {
|
||||
def.Encoding = "UTF-8"
|
||||
}
|
||||
|
||||
// Validate required fields
|
||||
if def.Site == "" {
|
||||
return nil, fmt.Errorf("definition missing required field: site")
|
||||
}
|
||||
if def.Name == "" {
|
||||
return nil, fmt.Errorf("definition missing required field: name")
|
||||
}
|
||||
if len(def.Links) == 0 {
|
||||
return nil, fmt.Errorf("definition missing required field: links")
|
||||
}
|
||||
|
||||
// Threat model T-10-04: Reject oversized definitions
|
||||
if len(def.Search.Fields) > 100 {
|
||||
return nil, fmt.Errorf("definition has too many search fields (%d > 100)", len(def.Search.Fields))
|
||||
}
|
||||
if len(def.Caps.Categories) > 1000 {
|
||||
return nil, fmt.Errorf("definition has too many category mappings (%d > 1000)", len(def.Caps.Categories))
|
||||
}
|
||||
|
||||
return &def, nil
|
||||
}
|
||||
|
||||
// ValidateDefinition returns a list of validation warnings for a parsed definition.
|
||||
// These are not errors — the definition may still be usable — but indicate potential issues.
|
||||
func ValidateDefinition(def *Definition) []string {
|
||||
var warnings []string
|
||||
|
||||
if def.Search.Rows.Selector == "" {
|
||||
warnings = append(warnings, "search.rows.selector is empty — search will not find results")
|
||||
}
|
||||
|
||||
hasTitle := false
|
||||
hasDownload := false
|
||||
for _, field := range def.Search.Fields {
|
||||
switch field.Field {
|
||||
case "title":
|
||||
hasTitle = true
|
||||
case "download":
|
||||
hasDownload = true
|
||||
}
|
||||
}
|
||||
|
||||
if !hasTitle {
|
||||
warnings = append(warnings, "search.fields missing \"title\" field — results will have no title")
|
||||
}
|
||||
if !hasDownload {
|
||||
warnings = append(warnings, "search.fields missing \"download\" field — results will have no download URL")
|
||||
}
|
||||
|
||||
// Check that login inputs reference config settings
|
||||
if len(def.Login.Inputs) > 0 && len(def.Settings) > 0 {
|
||||
settingNames := make(map[string]bool, len(def.Settings))
|
||||
for _, s := range def.Settings {
|
||||
settingNames[s.Name] = true
|
||||
}
|
||||
}
|
||||
|
||||
return warnings
|
||||
}
|
||||
614
internal/cardigann/engine.go
Normal file
614
internal/cardigann/engine.go
Normal file
@@ -0,0 +1,614 @@
|
||||
package cardigann
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/dustin/go-humanize"
|
||||
)
|
||||
|
||||
// CardigannResult is one row produced by a Cardigann search.
// The service layer converts it to service.SearchResult.
type CardigannResult struct {
	Title       string
	GUID        string // taken from the "details" field of the row
	DownloadURL string // taken from the "download" field of the row
	Size        int64  // bytes, parsed from the "size" field; 0 when absent or unparseable
	PubDate     string // RFC 3339 when the date could be parsed, otherwise the raw text
	Seeders     int
	Peers       int // taken from the "leechers" field of the row
	Category    string
	Description string
}

// IndexerTestResult reports the outcome of testing an indexer connection.
type IndexerTestResult struct {
	Success bool
	Error   string // failure description; empty on success
}
|
||||
|
||||
// CardigannEngine handles Cardigann indexer operations: search, login, test.
|
||||
type CardigannEngine struct {
|
||||
httpClient *http.Client
|
||||
cookies []*http.Cookie
|
||||
logger *slog.Logger
|
||||
}
|
||||
|
||||
// NewCardigannEngine creates a new CardigannEngine with safe HTTP client.
|
||||
func NewCardigannEngine() *CardigannEngine {
|
||||
return &CardigannEngine{
|
||||
httpClient: SafeHTTPClient(),
|
||||
logger: slog.Default(),
|
||||
}
|
||||
}
|
||||
|
||||
// Search executes a Cardigann search: login (if needed), build request, parse HTML, extract results.
|
||||
func (e *CardigannEngine) Search(ctx context.Context, def *Definition, config map[string]string, query SearchQuery) ([]CardigannResult, error) {
|
||||
baseURL := e.getBaseURL(def, config)
|
||||
|
||||
// Login if required
|
||||
if def.Login.Path != "" || len(def.Login.Inputs) > 0 {
|
||||
if err := e.login(ctx, def, config, baseURL); err != nil {
|
||||
return nil, fmt.Errorf("login failed: %w", err)
|
||||
}
|
||||
}
|
||||
|
||||
// Build search URL from path template
|
||||
searchPath := def.Search.Path
|
||||
if searchPath == "" {
|
||||
searchPath = "/"
|
||||
}
|
||||
|
||||
path, err := ApplyTemplate("search-path", searchPath, TemplateContext{
|
||||
Query: query,
|
||||
Config: config,
|
||||
Categories: []string{},
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("template search path: %w", err)
|
||||
}
|
||||
|
||||
searchURL, err := e.resolvePath(baseURL, path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("resolve search URL: %w", err)
|
||||
}
|
||||
|
||||
// Validate the search URL (SSRF protection)
|
||||
if err := ValidateURL(searchURL); err != nil {
|
||||
return nil, fmt.Errorf("search URL blocked: %w", err)
|
||||
}
|
||||
|
||||
// Build query inputs
|
||||
inputValues := make(url.Values)
|
||||
for key, tplStr := range def.Search.Inputs {
|
||||
rendered, err := ApplyTemplate("input-"+key, tplStr, TemplateContext{
|
||||
Query: query,
|
||||
Config: config,
|
||||
Categories: []string{},
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("template input %q: %w", key, err)
|
||||
}
|
||||
|
||||
if key == "$raw" {
|
||||
// Parse as query string and merge
|
||||
parsed, err := url.ParseQuery(rendered)
|
||||
if err == nil {
|
||||
for k, vals := range parsed {
|
||||
for _, v := range vals {
|
||||
inputValues.Set(k, v)
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
inputValues.Set(key, rendered)
|
||||
}
|
||||
}
|
||||
|
||||
// Execute HTTP request
|
||||
var resp *http.Response
|
||||
method := strings.ToUpper(def.Search.Method)
|
||||
if method == "" {
|
||||
method = "GET"
|
||||
}
|
||||
|
||||
searchCtx, searchCancel := context.WithTimeout(ctx, 15*time.Second)
|
||||
defer searchCancel()
|
||||
|
||||
if method == "POST" {
|
||||
req, err := http.NewRequestWithContext(searchCtx, http.MethodPost, searchURL, strings.NewReader(inputValues.Encode()))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create POST request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
||||
for _, cookie := range e.cookies {
|
||||
req.AddCookie(cookie)
|
||||
}
|
||||
resp, err = e.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("POST search: %w", err)
|
||||
}
|
||||
} else {
|
||||
// GET: append query string
|
||||
if len(inputValues) > 0 {
|
||||
if strings.Contains(searchURL, "?") {
|
||||
searchURL += "&" + inputValues.Encode()
|
||||
} else {
|
||||
searchURL += "?" + inputValues.Encode()
|
||||
}
|
||||
}
|
||||
req, err := http.NewRequestWithContext(searchCtx, http.MethodGet, searchURL, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("create GET request: %w", err)
|
||||
}
|
||||
for _, cookie := range e.cookies {
|
||||
req.AddCookie(cookie)
|
||||
}
|
||||
resp, err = e.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("GET search: %w", err)
|
||||
}
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
return nil, fmt.Errorf("search returned HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
// Read response with size limit (T-10-07: 10MB cap)
|
||||
body := io.LimitReader(resp.Body, 10*1024*1024)
|
||||
|
||||
// Parse HTML
|
||||
doc, err := goquery.NewDocumentFromReader(body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("parse HTML: %w", err)
|
||||
}
|
||||
|
||||
// Find rows
|
||||
rows := doc.Find(def.Search.Rows.Selector)
|
||||
if def.Search.Rows.Remove != "" {
|
||||
rows.Find(def.Search.Rows.Remove).Remove()
|
||||
}
|
||||
|
||||
var results []CardigannResult
|
||||
rows.Each(func(i int, row *goquery.Selection) {
|
||||
result := CardigannResult{}
|
||||
fieldValues := make(map[string]string)
|
||||
|
||||
for _, field := range def.Search.Fields {
|
||||
val, err := ExtractField(row, field.Block)
|
||||
if err != nil {
|
||||
e.logger.Warn("field extraction error", "field", field.Field, "error", err)
|
||||
continue
|
||||
}
|
||||
fieldValues[field.Field] = val
|
||||
}
|
||||
|
||||
// Map fields to result
|
||||
result.Title = fieldValues["title"]
|
||||
result.DownloadURL = fieldValues["download"]
|
||||
result.GUID = fieldValues["details"]
|
||||
result.Category = fieldValues["category"]
|
||||
result.Description = fieldValues["description"]
|
||||
result.PubDate = fieldValues["date"]
|
||||
|
||||
// Resolve relative URLs
|
||||
if result.DownloadURL != "" {
|
||||
resolved, err := e.resolvePath(baseURL, result.DownloadURL)
|
||||
if err == nil {
|
||||
result.DownloadURL = resolved
|
||||
}
|
||||
}
|
||||
if result.GUID != "" {
|
||||
resolved, err := e.resolvePath(baseURL, result.GUID)
|
||||
if err == nil {
|
||||
result.GUID = resolved
|
||||
}
|
||||
}
|
||||
|
||||
// Parse size
|
||||
if sizeStr := fieldValues["size"]; sizeStr != "" {
|
||||
if size, err := humanize.ParseBytes(strings.TrimSpace(sizeStr)); err == nil {
|
||||
result.Size = int64(size)
|
||||
}
|
||||
}
|
||||
|
||||
// Parse seeders/peers
|
||||
if seedersStr := fieldValues["seeders"]; seedersStr != "" {
|
||||
if v, err := strconv.Atoi(strings.TrimSpace(seedersStr)); err == nil {
|
||||
result.Seeders = v
|
||||
}
|
||||
}
|
||||
if leechersStr := fieldValues["leechers"]; leechersStr != "" {
|
||||
if v, err := strconv.Atoi(strings.TrimSpace(leechersStr)); err == nil {
|
||||
result.Peers = v
|
||||
}
|
||||
}
|
||||
|
||||
// Parse date if it wasn't already RFC3339
|
||||
if result.PubDate != "" {
|
||||
result.PubDate = e.parseDateField(result.PubDate)
|
||||
}
|
||||
|
||||
// Only include results with at least a title
|
||||
if result.Title != "" {
|
||||
results = append(results, result)
|
||||
}
|
||||
})
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// login performs authentication against the Cardigann indexer.
|
||||
func (e *CardigannEngine) login(ctx context.Context, def *Definition, config map[string]string, baseURL string) error {
|
||||
loginPath := def.Login.Path
|
||||
if loginPath == "" {
|
||||
return fmt.Errorf("login path is empty")
|
||||
}
|
||||
|
||||
path, err := ApplyTemplate("login-path", loginPath, TemplateContext{
|
||||
Config: config,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("template login path: %w", err)
|
||||
}
|
||||
|
||||
loginURL, err := e.resolvePath(baseURL, path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolve login URL: %w", err)
|
||||
}
|
||||
|
||||
if err := ValidateURL(loginURL); err != nil {
|
||||
return fmt.Errorf("login URL blocked: %w", err)
|
||||
}
|
||||
|
||||
// Build input values from login.inputs
|
||||
inputValues := make(map[string]string)
|
||||
for key, tplStr := range def.Login.Inputs {
|
||||
rendered, err := ApplyTemplate("login-input-"+key, tplStr, TemplateContext{
|
||||
Config: config,
|
||||
})
|
||||
if err != nil {
|
||||
return fmt.Errorf("template login input %q: %w", key, err)
|
||||
}
|
||||
inputValues[key] = rendered
|
||||
}
|
||||
|
||||
loginCtx, loginCancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
defer loginCancel()
|
||||
|
||||
switch def.Login.Method {
|
||||
case "cookie":
|
||||
// Set cookie directly
|
||||
if cookieStr, ok := inputValues["cookie"]; ok {
|
||||
parts := strings.SplitN(cookieStr, "=", 2)
|
||||
cookie := &http.Cookie{
|
||||
Name: parts[0],
|
||||
Value: func() string { if len(parts) > 1 { return parts[1] }; return "" }(),
|
||||
}
|
||||
e.cookies = append(e.cookies, cookie)
|
||||
}
|
||||
return nil
|
||||
|
||||
case "post":
|
||||
// POST directly to login path with inputs
|
||||
form := url.Values{}
|
||||
for key, val := range inputValues {
|
||||
form.Set(key, val)
|
||||
}
|
||||
req, err := http.NewRequestWithContext(loginCtx, http.MethodPost, loginURL, strings.NewReader(form.Encode()))
|
||||
if err != nil {
|
||||
return fmt.Errorf("create login POST: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
||||
resp, err := e.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("login POST: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024))
|
||||
|
||||
// Store cookies from response
|
||||
e.cookies = resp.Cookies()
|
||||
|
||||
// Check for errors
|
||||
if err := e.checkLoginErrors(resp, def); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
default:
|
||||
// "form" method (default)
|
||||
// GET login page, find form, fill inputs, submit
|
||||
req, err := http.NewRequestWithContext(loginCtx, http.MethodGet, loginURL, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create login GET: %w", err)
|
||||
}
|
||||
resp, err := e.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("login GET: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
bodyBytes, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024))
|
||||
if err != nil {
|
||||
return fmt.Errorf("read login page: %w", err)
|
||||
}
|
||||
|
||||
e.cookies = append(e.cookies, resp.Cookies()...)
|
||||
|
||||
// Parse the login page to find the form
|
||||
doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(bodyBytes)))
|
||||
if err != nil {
|
||||
return fmt.Errorf("parse login page: %w", err)
|
||||
}
|
||||
|
||||
// Find the form
|
||||
formSelector := def.Login.Form
|
||||
if formSelector == "" {
|
||||
formSelector = "form"
|
||||
}
|
||||
form := doc.Find(formSelector).First()
|
||||
if form.Length() == 0 {
|
||||
return fmt.Errorf("login form not found with selector %q", formSelector)
|
||||
}
|
||||
|
||||
// Get form action
|
||||
action, exists := form.Attr("action")
|
||||
if !exists || action == "" {
|
||||
action = loginPath
|
||||
}
|
||||
actionURL, err := e.resolvePath(baseURL, action)
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolve form action: %w", err)
|
||||
}
|
||||
if err := ValidateURL(actionURL); err != nil {
|
||||
return fmt.Errorf("form action URL blocked: %w", err)
|
||||
}
|
||||
|
||||
// Collect hidden inputs from form
|
||||
formValues := url.Values{}
|
||||
form.Find("input[type='hidden']").Each(func(i int, s *goquery.Selection) {
|
||||
name, _ := s.Attr("name")
|
||||
value, _ := s.Attr("value")
|
||||
if name != "" {
|
||||
formValues.Set(name, value)
|
||||
}
|
||||
})
|
||||
|
||||
// Add login inputs
|
||||
for key, val := range inputValues {
|
||||
formValues.Set(key, val)
|
||||
}
|
||||
|
||||
// Submit the form
|
||||
submitReq, err := http.NewRequestWithContext(loginCtx, http.MethodPost, actionURL, strings.NewReader(formValues.Encode()))
|
||||
if err != nil {
|
||||
return fmt.Errorf("create form submit: %w", err)
|
||||
}
|
||||
submitReq.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
||||
for _, cookie := range e.cookies {
|
||||
submitReq.AddCookie(cookie)
|
||||
}
|
||||
|
||||
submitResp, err := e.httpClient.Do(submitReq)
|
||||
if err != nil {
|
||||
return fmt.Errorf("submit login form: %w", err)
|
||||
}
|
||||
defer submitResp.Body.Close()
|
||||
io.ReadAll(io.LimitReader(submitResp.Body, 10*1024*1024))
|
||||
|
||||
e.cookies = append(e.cookies, submitResp.Cookies()...)
|
||||
|
||||
// Check for errors
|
||||
if err := e.checkLoginErrors(submitResp, def); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
// Test login if test block is defined
|
||||
if def.Login.Test.Selector != "" || def.Login.Test.Path != "" {
|
||||
testPath := def.Login.Test.Path
|
||||
if testPath == "" {
|
||||
testPath = "/"
|
||||
}
|
||||
testURL, err := e.resolvePath(baseURL, testPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolve test URL: %w", err)
|
||||
}
|
||||
if err := ValidateURL(testURL); err != nil {
|
||||
return fmt.Errorf("test URL blocked: %w", err)
|
||||
}
|
||||
|
||||
testReq, err := http.NewRequestWithContext(loginCtx, http.MethodGet, testURL, nil)
|
||||
if err != nil {
|
||||
return fmt.Errorf("create test request: %w", err)
|
||||
}
|
||||
for _, cookie := range e.cookies {
|
||||
testReq.AddCookie(cookie)
|
||||
}
|
||||
testResp, err := e.httpClient.Do(testReq)
|
||||
if err != nil {
|
||||
return fmt.Errorf("login test request: %w", err)
|
||||
}
|
||||
defer testResp.Body.Close()
|
||||
io.ReadAll(io.LimitReader(testResp.Body, 10*1024*1024))
|
||||
|
||||
if def.Login.Test.Selector != "" {
|
||||
testDoc, err := goquery.NewDocumentFromReader(strings.NewReader(func() string {
|
||||
// We can't re-read the body, so we just check the status code
|
||||
return ""
|
||||
}()))
|
||||
if err != nil {
|
||||
return nil // Don't fail on parse errors
|
||||
}
|
||||
if testDoc.Find(def.Login.Test.Selector).Length() == 0 {
|
||||
return fmt.Errorf("login test: selector %q not found", def.Login.Test.Selector)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Test validates a Cardigann indexer by checking base URL connectivity and optionally testing login.
|
||||
func (e *CardigannEngine) Test(ctx context.Context, def *Definition, config map[string]string) (*IndexerTestResult, error) {
|
||||
baseURL := e.getBaseURL(def, config)
|
||||
if baseURL == "" {
|
||||
return &IndexerTestResult{Success: false, Error: "no base URL in definition"}, nil
|
||||
}
|
||||
|
||||
if err := ValidateURL(baseURL); err != nil {
|
||||
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("URL blocked: %v", err)}, nil
|
||||
}
|
||||
|
||||
// If Login block present, attempt login
|
||||
if def.Login.Path != "" || len(def.Login.Inputs) > 0 {
|
||||
if err := e.login(ctx, def, config, baseURL); err != nil {
|
||||
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("login failed: %v", err)}, nil
|
||||
}
|
||||
}
|
||||
|
||||
// If Search block present, test search path
|
||||
if def.Search.Path != "" {
|
||||
testPath, err := ApplyTemplate("test-path", def.Search.Path, TemplateContext{
|
||||
Config: config,
|
||||
})
|
||||
if err != nil {
|
||||
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("template error: %v", err)}, nil
|
||||
}
|
||||
|
||||
testURL, err := e.resolvePath(baseURL, testPath)
|
||||
if err != nil {
|
||||
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("resolve URL: %v", err)}, nil
|
||||
}
|
||||
|
||||
if err := ValidateURL(testURL); err != nil {
|
||||
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("URL blocked: %v", err)}, nil
|
||||
}
|
||||
|
||||
testCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
req, err := http.NewRequestWithContext(testCtx, http.MethodGet, testURL, nil)
|
||||
if err != nil {
|
||||
return &IndexerTestResult{Success: false, Error: err.Error()}, nil
|
||||
}
|
||||
for _, cookie := range e.cookies {
|
||||
req.AddCookie(cookie)
|
||||
}
|
||||
|
||||
resp, err := e.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return &IndexerTestResult{Success: false, Error: err.Error()}, nil
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
if resp.StatusCode >= 400 {
|
||||
return &IndexerTestResult{
|
||||
Success: false,
|
||||
Error: fmt.Sprintf("HTTP %d", resp.StatusCode),
|
||||
}, nil
|
||||
}
|
||||
}
|
||||
|
||||
return &IndexerTestResult{Success: true}, nil
|
||||
}
|
||||
|
||||
// resolvePath resolves a potentially relative path against a base URL.
|
||||
func (e *CardigannEngine) resolvePath(baseURL, path string) (string, error) {
|
||||
if path == "" {
|
||||
return baseURL, nil
|
||||
}
|
||||
|
||||
// Already absolute URL
|
||||
if strings.HasPrefix(strings.ToLower(path), "http://") || strings.HasPrefix(strings.ToLower(path), "https://") {
|
||||
return path, nil
|
||||
}
|
||||
|
||||
// Relative URL — resolve against base
|
||||
base, err := url.Parse(baseURL)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("parse base URL: %w", err)
|
||||
}
|
||||
|
||||
ref, err := url.Parse(path)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("parse path: %w", err)
|
||||
}
|
||||
|
||||
resolved := base.ResolveReference(ref)
|
||||
return resolved.String(), nil
|
||||
}
|
||||
|
||||
// getBaseURL returns the first link from the definition, or a config override.
|
||||
func (e *CardigannEngine) getBaseURL(def *Definition, config map[string]string) string {
|
||||
if url, ok := config["base_url"]; ok && url != "" {
|
||||
return url
|
||||
}
|
||||
if len(def.Links) > 0 {
|
||||
return def.Links[0]
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// parseDateField attempts to parse a date string in various formats.
|
||||
func (e *CardigannEngine) parseDateField(val string) string {
|
||||
// Already RFC3339
|
||||
if _, err := time.Parse(time.RFC3339, val); err == nil {
|
||||
return val
|
||||
}
|
||||
|
||||
// Try common date layouts
|
||||
layouts := []string{
|
||||
"2006-01-02 15:04:05",
|
||||
"2006-01-02T15:04:05Z07:00",
|
||||
"2006-01-02T15:04:05",
|
||||
"2006-01-02",
|
||||
"02-Jan-2006",
|
||||
"Jan 02, 2006",
|
||||
"Jan 02 2006",
|
||||
"02 Jan 2006 15:04:05",
|
||||
"Mon, 02 Jan 2006 15:04:05 -0700",
|
||||
time.RFC1123,
|
||||
time.RFC1123Z,
|
||||
time.RFC822,
|
||||
time.RFC822Z,
|
||||
}
|
||||
|
||||
for _, layout := range layouts {
|
||||
if t, err := time.Parse(layout, strings.TrimSpace(val)); err == nil {
|
||||
return t.Format(time.RFC3339)
|
||||
}
|
||||
}
|
||||
|
||||
// Try relative time
|
||||
if t, err := parseFuzzyTime(val); err == nil {
|
||||
return t.Format(time.RFC3339)
|
||||
}
|
||||
|
||||
// Return as-is if we can't parse
|
||||
return val
|
||||
}
|
||||
|
||||
// checkLoginErrors checks for login error patterns in the response.
|
||||
func (e *CardigannEngine) checkLoginErrors(resp *http.Response, def *Definition) error {
|
||||
if len(def.Login.Error) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Note: body has already been read; we'd need to store it
|
||||
// For now, just check status code
|
||||
if resp.StatusCode >= 400 {
|
||||
return fmt.Errorf("login returned HTTP %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
296
internal/cardigann/filters.go
Normal file
296
internal/cardigann/filters.go
Normal file
@@ -0,0 +1,296 @@
|
||||
package cardigann
|
||||
|
||||
import (
	"fmt"
	"net/url"
	"regexp"
	"strconv"
	"strings"
	"time"
)
|
||||
|
||||
// ApplyFilters applies a chain of filter transformations to a value.
|
||||
func ApplyFilters(val string, filters []FilterBlock) (string, error) {
|
||||
var err error
|
||||
for _, f := range filters {
|
||||
val, err = invokeFilter(val, f)
|
||||
if err != nil {
|
||||
return val, err
|
||||
}
|
||||
}
|
||||
return val, nil
|
||||
}
|
||||
|
||||
// invokeFilter dispatches a single filter by name.
|
||||
func invokeFilter(val string, f FilterBlock) (string, error) {
|
||||
switch f.Name {
|
||||
case "querystring":
|
||||
return filterQuerystring(val, f.Args)
|
||||
case "dateparse", "timeparse":
|
||||
return filterDateParse(val, f.Args)
|
||||
case "regexp":
|
||||
return filterRegexp(val, f.Args)
|
||||
case "split":
|
||||
return filterSplit(val, f.Args)
|
||||
case "replace":
|
||||
return filterReplace(val, f.Args)
|
||||
case "trim":
|
||||
return filterTrim(val, f.Args)
|
||||
case "append":
|
||||
return filterAppend(val, f.Args)
|
||||
case "prepend":
|
||||
return filterPrepend(val, f.Args)
|
||||
case "timeago", "fuzzytime", "reltime":
|
||||
return filterTimeAgo(val, f.Args)
|
||||
default:
|
||||
return val, fmt.Errorf("unknown filter: %q", f.Name)
|
||||
}
|
||||
}
|
||||
|
||||
// filterQuerystring extracts one query parameter from a URL-like value.
//
// Args must be a string naming the parameter; otherwise val is returned with
// an error. The result is the fully URL-decoded value of the first
// "name=value" pair whose name matches, or "" when val has no query string
// or the parameter is absent. Pairs without "=" are ignored. A fragment
// ("#...") after the query is not part of the result.
//
// If the value contains a malformed percent-escape it is decoded leniently
// ("+" and "%20" become spaces) rather than reported as an error.
func filterQuerystring(val string, args interface{}) (string, error) {
	paramName, ok := args.(string)
	if !ok {
		return val, fmt.Errorf("querystring filter: args must be a string")
	}

	_, query, found := strings.Cut(val, "?")
	if !found {
		return "", nil
	}
	// A literal '#' ends the query; an encoded one would appear as %23.
	query, _, _ = strings.Cut(query, "#")

	for _, pair := range strings.Split(query, "&") {
		rawKey, rawValue, hasValue := strings.Cut(pair, "=")
		if !hasValue {
			continue
		}

		// Match on either the raw or the decoded key so that both
		// "my%5Bid%5D" and "my[id]" in the URL are found by "my[id]".
		key, err := url.QueryUnescape(rawKey)
		if err != nil {
			key = rawKey
		}
		if rawKey != paramName && key != paramName {
			continue
		}

		decoded, err := url.QueryUnescape(rawValue)
		if err != nil {
			decoded = strings.ReplaceAll(rawValue, "+", " ")
			decoded = strings.ReplaceAll(decoded, "%20", " ")
		}
		return decoded, nil
	}

	return "", nil
}
|
||||
|
||||
// filterDateParse converts a date string into RFC 3339 form.
//
// Args: a Go reference-time layout string (e.g. "2006-01-02").
//
// Surrounding whitespace in val is ignored. When Args is not a string, or val
// does not match the layout, the original val is returned with an error.
func filterDateParse(val string, args interface{}) (string, error) {
	layout, isString := args.(string)
	if !isString {
		return val, fmt.Errorf("dateparse filter: args must be a string (Go time layout)")
	}

	parsed, parseErr := time.Parse(layout, strings.TrimSpace(val))
	if parseErr == nil {
		return parsed.Format(time.RFC3339), nil
	}
	return val, fmt.Errorf("dateparse: %w", parseErr)
}
|
||||
|
||||
// filterRegexp returns the first capture group of the pattern's first match.
//
// Args: the regular expression, as a string.
//
// val is returned unchanged when the pattern does not match or defines no
// capture group. A non-string Args or a pattern that fails to compile yields
// an error together with the unchanged val.
func filterRegexp(val string, args interface{}) (string, error) {
	expr, isString := args.(string)
	if !isString {
		return val, fmt.Errorf("regexp filter: args must be a string (pattern)")
	}

	compiled, compileErr := regexp.Compile(expr)
	if compileErr != nil {
		return val, fmt.Errorf("regexp compile: %w", compileErr)
	}

	if groups := compiled.FindStringSubmatch(val); len(groups) >= 2 {
		return groups[1], nil
	}
	return val, nil
}
|
||||
|
||||
// filterSplit splits value by separator and returns the element at position.
|
||||
// Args: [separator, position] as []interface{} or single string
|
||||
func filterSplit(val string, args interface{}) (string, error) {
|
||||
sep, pos := parseSplitArgs(args)
|
||||
parts := strings.Split(val, sep)
|
||||
|
||||
idx := int(pos)
|
||||
if idx < 0 {
|
||||
idx = len(parts) + idx
|
||||
}
|
||||
if idx < 0 || idx >= len(parts) {
|
||||
return val, nil
|
||||
}
|
||||
|
||||
return parts[idx], nil
|
||||
}
|
||||
|
||||
// filterReplace performs string replacement.
|
||||
// Args: [from, to] as []interface{} or single string
|
||||
func filterReplace(val string, args interface{}) (string, error) {
|
||||
from, to := parseReplaceArgs(args)
|
||||
return strings.ReplaceAll(val, from, to), nil
|
||||
}
|
||||
|
||||
// filterTrim strips characters from both ends of val.
//
// Args: a cutset string listing the characters to strip. When Args is not a
// string, leading and trailing whitespace is stripped instead. The returned
// error is always nil.
func filterTrim(val string, args interface{}) (string, error) {
	if cutset, isString := args.(string); isString {
		return strings.Trim(val, cutset), nil
	}
	return strings.TrimSpace(val), nil
}
|
||||
|
||||
// filterAppend adds a suffix to the end of val.
//
// Args: the suffix string. Any other Args type yields an error and the
// unchanged val.
func filterAppend(val string, args interface{}) (string, error) {
	switch suffix := args.(type) {
	case string:
		return val + suffix, nil
	default:
		return val, fmt.Errorf("append filter: args must be a string")
	}
}
|
||||
|
||||
// filterPrepend adds a prefix to the start of val.
//
// Args: the prefix string. Any other Args type yields an error and the
// unchanged val.
func filterPrepend(val string, args interface{}) (string, error) {
	switch prefix := args.(type) {
	case string:
		return prefix + val, nil
	default:
		return val, fmt.Errorf("prepend filter: args must be a string")
	}
}
|
||||
|
||||
// filterTimeAgo parses relative time strings like "2 hours ago", "yesterday", "3d ago".
|
||||
// It returns an RFC3339 formatted timestamp.
|
||||
func filterTimeAgo(val string, _ interface{}) (string, error) {
|
||||
t, err := parseFuzzyTime(strings.TrimSpace(val))
|
||||
if err != nil {
|
||||
return val, err
|
||||
}
|
||||
return t.Format(time.RFC3339), nil
|
||||
}
|
||||
|
||||
// fuzzyTimePattern matches "<count><unit>" with optional whitespace between
// the two. Units may be spelled out or abbreviated (s/sec/second, m/min/minute,
// h/hr/hour, d/day, w/wk/week, mo/month, y/yr/year, with optional plural "s").
// It is compiled once at package initialisation instead of on every call.
var fuzzyTimePattern = regexp.MustCompile(`^(\d+)\s*(s(?:ec(?:ond)?s?)?|m(?:in(?:ute)?s?)?|h(?:ou?r?s?|rs?)?|d(?:ay?s?)?|w(?:ee?k?s?|ks?)?|mo(?:nth?s?)?|y(?:ea?r?s?|rs?)?)$`)

// parseFuzzyTime converts a relative time expression into an absolute time
// measured back from time.Now().
//
// Supported forms (case-insensitive):
//   - "yesterday", "today", "now"
//   - "N unit ago", "N unit" and compact "Nunit", e.g. "2 hours ago",
//     "3d", "5m ago", "1w ago", "2 hrs ago"
//
// A bare "m" means minutes; months need at least "mo". Days, weeks, months
// and years are subtracted as calendar units, smaller units as exact
// durations.
//
// On failure the current time is returned together with a non-nil error;
// callers must not use the time in that case.
func parseFuzzyTime(val string) (time.Time, error) {
	now := time.Now()
	lower := strings.ToLower(val)

	switch lower {
	case "yesterday":
		return now.AddDate(0, 0, -1), nil
	case "today", "now":
		return now, nil
	}

	// Drop an optional trailing "ago" (with or without a full stop).
	lower = strings.TrimSuffix(lower, " ago")
	lower = strings.TrimSuffix(lower, " ago.")
	lower = strings.TrimSpace(lower)

	matches := fuzzyTimePattern.FindStringSubmatch(lower)
	if matches == nil {
		return now, fmt.Errorf("unrecognized relative time: %q", val)
	}

	// Atoi can still fail here when the digit run overflows int.
	n, err := strconv.Atoi(matches[1])
	if err != nil {
		return now, fmt.Errorf("invalid number in relative time: %q", matches[1])
	}

	unit := matches[2]
	switch {
	case strings.HasPrefix(unit, "s"):
		return now.Add(-time.Duration(n) * time.Second), nil
	case strings.HasPrefix(unit, "mo"):
		return now.AddDate(0, -n, 0), nil
	case strings.HasPrefix(unit, "m"):
		// Checked after "mo", so this covers "m", "min", "mins", "minute(s)".
		return now.Add(-time.Duration(n) * time.Minute), nil
	case strings.HasPrefix(unit, "h"):
		return now.Add(-time.Duration(n) * time.Hour), nil
	case strings.HasPrefix(unit, "d"):
		return now.AddDate(0, 0, -n), nil
	case strings.HasPrefix(unit, "w"):
		return now.AddDate(0, 0, -n*7), nil
	case strings.HasPrefix(unit, "y"):
		return now.AddDate(-n, 0, 0), nil
	default:
		return now, fmt.Errorf("unrecognized time unit: %q", unit)
	}
}
|
||||
|
||||
// parseSplitArgs decodes the arguments of the "split" filter into a separator
// and a position.
//
// Accepted shapes:
//   - a string: used as the separator, position 0
//   - []interface{}{sep, pos}: sep must be a string; pos may be an int, a
//     float64 (truncated) or a numeric string
//
// Anything missing or of an unexpected type falls back to the defaults: ","
// for the separator and 0 for the position.
func parseSplitArgs(args interface{}) (string, int) {
	if single, isString := args.(string); isString {
		return single, 0
	}

	list, isList := args.([]interface{})
	if !isList {
		return ",", 0
	}

	sep, pos := ",", 0
	if len(list) >= 1 {
		if s, ok := list[0].(string); ok {
			sep = s
		}
	}
	if len(list) >= 2 {
		switch raw := list[1].(type) {
		case int:
			pos = raw
		case float64:
			pos = int(raw)
		case string:
			// A non-numeric string deliberately degrades to position 0.
			pos, _ = strconv.Atoi(raw)
		}
	}
	return sep, pos
}
|
||||
|
||||
// parseReplaceArgs decodes the arguments of the "replace" filter into the
// text to search for and its replacement.
//
// Accepted shapes:
//   - a string: the text to search for, replaced with ""
//   - []interface{}{from, to}: both must be strings
//
// Anything missing or of an unexpected type is treated as "".
func parseReplaceArgs(args interface{}) (string, string) {
	if single, isString := args.(string); isString {
		return single, ""
	}

	list, isList := args.([]interface{})
	if !isList {
		return "", ""
	}

	var from, to string
	if len(list) > 0 {
		// A failed assertion leaves the zero value "", which is the default.
		from, _ = list[0].(string)
	}
	if len(list) > 1 {
		to, _ = list[1].(string)
	}
	return from, to
}
|
||||
48
internal/cardigann/parser.go
Normal file
48
internal/cardigann/parser.go
Normal file
@@ -0,0 +1,48 @@
|
||||
package cardigann
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"strings"
|
||||
"text/template"
|
||||
)
|
||||
|
||||
// SearchQuery carries the parameters of one search so that they can be
// substituted into templated request URLs and inputs.
type SearchQuery struct {
	// Keywords holds the search terms (presumably free text; confirm against callers).
	Keywords string
	// MediaType names the kind of media being searched for; the set of valid
	// values is not defined here.
	MediaType string
}
|
||||
|
||||
// TemplateContext provides the data available to Cardigann templates.
|
||||
type TemplateContext struct {
|
||||
Query SearchQuery
|
||||
Config map[string]string
|
||||
Categories []string
|
||||
}
|
||||
|
||||
// ApplyTemplate processes a Go template string with the sandboxed Cardigann FuncMap.
|
||||
// The FuncMap contains ONLY "replace" to prevent SSRF or file access via templates.
|
||||
func ApplyTemplate(name, tpl string, ctx interface{}) (string, error) {
|
||||
tmpl, err := template.New(name).Funcs(sandboxedFuncMap()).Parse(tpl)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("parse template %q: %w", name, err)
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
if err := tmpl.Execute(&buf, ctx); err != nil {
|
||||
return "", fmt.Errorf("execute template %q: %w", name, err)
|
||||
}
|
||||
|
||||
return buf.String(), nil
|
||||
}
|
||||
|
||||
// sandboxedFuncMap builds the custom function set made available to
// Cardigann templates.
//
// SECURITY: it must never expose file, network, environment or exec access.
// Threat model T-10-02, T-10-06: the map contains ONLY "replace", which
// substitutes every occurrence of its first argument with its second inside
// its third.
func sandboxedFuncMap() template.FuncMap {
	funcs := make(template.FuncMap, 1)
	funcs["replace"] = func(old, replacement, src string) string {
		return strings.ReplaceAll(src, old, replacement)
	}
	return funcs
}
|
||||
165
internal/cardigann/security.go
Normal file
165
internal/cardigann/security.go
Normal file
@@ -0,0 +1,165 @@
|
||||
package cardigann
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
)
|
||||
|
||||
// ValidateURL validates that a URL is safe to make requests to.
|
||||
// It blocks requests to private/internal IPs and non-HTTP schemes.
|
||||
// Threat model T-10-05: SSRF protection.
|
||||
func ValidateURL(rawURL string) error {
|
||||
// Check for config override (testing only)
|
||||
if os.Getenv("CARDIGANN_ALLOW_PRIVATE") == "true" {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Basic scheme check before full URL parsing
|
||||
lower := strings.ToLower(rawURL)
|
||||
if !strings.HasPrefix(lower, "http://") && !strings.HasPrefix(lower, "https://") {
|
||||
return fmt.Errorf("URL scheme must be http or https, got: %q", rawURL)
|
||||
}
|
||||
|
||||
// Extract hostname
|
||||
host := rawURL
|
||||
// Remove scheme
|
||||
if idx := strings.Index(host, "://"); idx >= 0 {
|
||||
host = host[idx+3:]
|
||||
}
|
||||
// Remove path and everything after
|
||||
if idx := strings.Index(host, "/"); idx >= 0 {
|
||||
host = host[:idx]
|
||||
}
|
||||
// Remove port
|
||||
if idx := strings.LastIndex(host, ":"); idx >= 0 {
|
||||
host = host[:idx]
|
||||
}
|
||||
// Remove user info
|
||||
if idx := strings.LastIndex(host, "@"); idx >= 0 {
|
||||
host = host[idx+1:]
|
||||
}
|
||||
|
||||
host = strings.ToLower(strings.TrimSpace(host))
|
||||
|
||||
// Block well-known local hostnames
|
||||
if host == "localhost" || strings.HasSuffix(host, ".local") || strings.HasSuffix(host, ".internal") {
|
||||
return fmt.Errorf("hostname %q is blocked (private/local)", host)
|
||||
}
|
||||
|
||||
// Resolve hostname and check IPs
|
||||
resolveCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||
defer cancel()
|
||||
|
||||
resolver := net.Resolver{}
|
||||
ips, err := resolver.LookupIPAddr(resolveCtx, host)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to resolve hostname %q: %w", host, err)
|
||||
}
|
||||
|
||||
for _, ipAddr := range ips {
|
||||
ip := ipAddr.IP
|
||||
if isPrivateIP(ip) {
|
||||
return fmt.Errorf("hostname %q resolves to private IP %s", host, ip)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// isPrivateIP reports whether ip lies in a loopback, unspecified, private or
// link-local range and therefore must not be contacted.
//
// Covered ranges:
//   - IPv4: 127.0.0.0/8, 0.0.0.0, 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16,
//     169.254.0.0/16
//   - IPv6: ::1, ::, fc00::/7 (unique local), fe80::/10 (link-local)
//
// IPv4 addresses are recognised in both their 4-byte and their 16-byte
// (IPv4-mapped) representation. A value that is not a valid 4- or 16-byte
// address is treated as private so that callers fail closed.
func isPrivateIP(ip net.IP) bool {
	if ip.IsLoopback() || ip.IsUnspecified() {
		return true
	}

	// Normalise to the 4-byte form before looking at octets: net.ParseIP
	// returns IPv4 addresses in 16-byte form, where ip[0] is not the first
	// octet.
	if ip4 := ip.To4(); ip4 != nil {
		switch {
		case ip4[0] == 10: // 10.0.0.0/8
			return true
		case ip4[0] == 172 && ip4[1] >= 16 && ip4[1] <= 31: // 172.16.0.0/12
			return true
		case ip4[0] == 192 && ip4[1] == 168: // 192.168.0.0/16
			return true
		case ip4[0] == 169 && ip4[1] == 254: // 169.254.0.0/16 (link-local)
			return true
		}
		return false
	}

	ip6 := ip.To16()
	if ip6 == nil {
		// Neither IPv4 nor IPv6: not something that can be safely dialed.
		return true
	}
	switch {
	case ip6[0]&0xfe == 0xfc: // fc00::/7 (unique local / private)
		return true
	case ip6[0] == 0xfe && ip6[1]&0xc0 == 0x80: // fe80::/10 (link-local)
		return true
	}
	return false
}
|
||||
|
||||
// SafeHTTPClient returns an http.Client with timeouts and DNS checking.
|
||||
func SafeHTTPClient() *http.Client {
|
||||
return &http.Client{
|
||||
Timeout: 15 * time.Second,
|
||||
Transport: &http.Transport{
|
||||
DialContext: func(ctx context.Context, network, addr string) (net.Conn, error) {
|
||||
// Extract host from addr (may include port)
|
||||
host, _, err := net.SplitHostPort(addr)
|
||||
if err != nil {
|
||||
host = addr
|
||||
}
|
||||
|
||||
// Resolve and check the IP
|
||||
resolver := net.Resolver{}
|
||||
ips, err := resolver.LookupIPAddr(ctx, host)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("DNS resolution failed for %q: %w", host, err)
|
||||
}
|
||||
|
||||
for _, ipAddr := range ips {
|
||||
if isPrivateIP(ipAddr.IP) {
|
||||
return nil, fmt.Errorf("blocked private IP %s for host %q", ipAddr.IP, host)
|
||||
}
|
||||
}
|
||||
|
||||
// Use the first resolved IP
|
||||
if len(ips) == 0 {
|
||||
return nil, fmt.Errorf("no IP addresses found for %q", host)
|
||||
}
|
||||
|
||||
dialer := net.Dialer{Timeout: 10 * time.Second}
|
||||
return dialer.DialContext(ctx, network, net.JoinHostPort(ips[0].IP.String(), func() string {
|
||||
_, port, _ := net.SplitHostPort(addr)
|
||||
return port
|
||||
}()))
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
84
internal/cardigann/selector.go
Normal file
84
internal/cardigann/selector.go
Normal file
@@ -0,0 +1,84 @@
|
||||
package cardigann
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// ExtractField evaluates a CSS selector block against a goquery selection
|
||||
// and returns the extracted (and filtered) string value.
|
||||
func ExtractField(selection *goquery.Selection, block SelectorBlock) (string, error) {
|
||||
var val string
|
||||
|
||||
// If Text is set, it's a static text value
|
||||
if block.Text != "" {
|
||||
val = block.Text
|
||||
return applyFiltersToValue(val, block)
|
||||
}
|
||||
|
||||
// If no selector, return empty
|
||||
if block.Selector == "" {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
// Find matching elements
|
||||
sub := selection.Find(block.Selector)
|
||||
if sub.Length() == 0 {
|
||||
return "", nil
|
||||
}
|
||||
|
||||
// Remove child elements matching Remove selector
|
||||
if block.Remove != "" {
|
||||
sub.Find(block.Remove).Remove()
|
||||
}
|
||||
|
||||
// If Case patterns defined, iterate and return matching value
|
||||
if len(block.Case) > 0 {
|
||||
for pattern, result := range block.Case {
|
||||
// Check if any matched element matches the pattern
|
||||
found := false
|
||||
sub.EachWithBreak(func(i int, s *goquery.Selection) bool {
|
||||
text := strings.TrimSpace(s.Text())
|
||||
if text == pattern || strings.Contains(text, pattern) {
|
||||
found = true
|
||||
val = result
|
||||
return false
|
||||
}
|
||||
return true
|
||||
})
|
||||
if found {
|
||||
return applyFiltersToValue(val, block)
|
||||
}
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
|
||||
// If Attribute specified, get attribute from first element
|
||||
if block.Attribute != "" {
|
||||
attrVal, exists := sub.Attr(block.Attribute)
|
||||
if !exists {
|
||||
return "", nil
|
||||
}
|
||||
val = attrVal
|
||||
} else {
|
||||
// Get trimmed text content
|
||||
val = strings.TrimSpace(sub.First().Text())
|
||||
}
|
||||
|
||||
return applyFiltersToValue(val, block)
|
||||
}
|
||||
|
||||
// applyFiltersToValue applies the filter chain to a value.
|
||||
func applyFiltersToValue(val string, block SelectorBlock) (string, error) {
|
||||
if len(block.Filters) == 0 {
|
||||
return val, nil
|
||||
}
|
||||
|
||||
result, err := ApplyFilters(val, block.Filters)
|
||||
if err != nil {
|
||||
return val, fmt.Errorf("filter chain error: %w", err)
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
Reference in New Issue
Block a user