Files
unified-media-manager/internal/cardigann/definition.go
2026-04-24 10:45:19 -07:00

288 lines
8.0 KiB
Go

package cardigann
import (
"fmt"
yaml "gopkg.in/yaml.v3"
)
// Definition represents a parsed Cardigann YAML indexer definition.
// It matches the upstream Cardigann schema for site definitions.
//
// ParseDefinition is the entry point that produces a Definition from raw
// YAML: it fills in Language and Encoding when they are absent and rejects
// definitions whose Site, Name, or Links are empty.
type Definition struct {
// Site is required; ParseDefinition returns an error when it is empty.
Site string `yaml:"site"`
// Name is required; ParseDefinition returns an error when it is empty.
Name string `yaml:"name"`
Description string `yaml:"description"`
// Language is set to "en-us" by ParseDefinition when omitted.
Language string `yaml:"language"`
// Encoding is set to "UTF-8" by ParseDefinition when omitted.
Encoding string `yaml:"encoding"`
// Links must contain at least one entry; the YAML value may be a single
// string or a list of strings (see StringOrSlice).
Links StringOrSlice `yaml:"links"`
// Settings lists the user-configurable fields declared by the definition.
Settings []SettingsField `yaml:"settings"`
Caps CapabilitiesBlock `yaml:"caps"`
Login LoginBlock `yaml:"login"`
Ratio RatioBlock `yaml:"ratio"`
Search SearchBlock `yaml:"search"`
}
// SettingsField describes a user-configurable field in the definition.
// All three values are taken verbatim from the YAML; nothing in this file
// interprets Type or Label.
type SettingsField struct {
// Name identifies the setting.
Name string `yaml:"name"`
// Type is presumably the kind of input (text, password, ...) — TODO confirm
// the accepted values against the consumer of this struct.
Type string `yaml:"type"`
Label string `yaml:"label"`
}
// CapabilitiesBlock maps categories and search modes.
type CapabilitiesBlock struct {
// Categories is a string-to-string mapping read from the "categories" key.
// ParseDefinition rejects definitions with more than 1000 entries here.
// NOTE(review): presumably site category ID -> standard category name —
// confirm against the code that consumes it.
Categories map[string]string `yaml:"categories"`
// Modes maps a mode name to a list of strings.
// NOTE(review): presumably search mode -> supported query parameters — confirm.
Modes map[string][]string `yaml:"modes"`
}
// LoginBlock describes authentication configuration.
//
// Decoding goes through (*LoginBlock).UnmarshalYAML, which replaces an empty
// Method with "form" and an empty Form with "form".
// NOTE(review): those defaults are applied inside UnmarshalYAML, so they
// presumably do not apply when the definition has no login key at all —
// confirm whether callers rely on that.
type LoginBlock struct {
Path string `yaml:"path"`
// Method defaults to "form" when the decoded value is empty.
Method string `yaml:"method"`
// Form defaults to "form" when the decoded value is empty.
Form string `yaml:"form"`
// Inputs is the string-to-string mapping read from the "inputs" key.
Inputs map[string]string `yaml:"inputs"`
// Error lists the error detection patterns for the login response.
Error []ErrorBlock `yaml:"error"`
// Test describes the page test used to verify login.
Test PageTestBlock `yaml:"test"`
}
// ErrorBlock describes an error detection pattern.
// It is used as the element type of LoginBlock.Error.
type ErrorBlock struct {
Path string `yaml:"path"`
Selector string `yaml:"selector"`
// Message is a full selector block (selector, attribute, filters, ...).
// NOTE(review): presumably extracts the human-readable error text — confirm.
Message SelectorBlock `yaml:"message"`
}
// PageTestBlock describes a page test for verifying login.
// It is used as the type of LoginBlock.Test; both fields are plain strings
// read from the YAML and are not interpreted in this file.
type PageTestBlock struct {
Path string `yaml:"path"`
Selector string `yaml:"selector"`
}
// SearchBlock describes search configuration.
type SearchBlock struct {
Path string `yaml:"path"`
Method string `yaml:"method"`
// Inputs is the string-to-string mapping read from the "inputs" key.
Inputs map[string]string `yaml:"inputs"`
// Rows describes how to find result rows; ValidateDefinition warns when
// Rows.Selector is empty.
Rows RowsBlock `yaml:"rows"`
// Fields keeps the YAML mapping's key order (see FieldsListBlock).
// ParseDefinition rejects definitions with more than 100 fields, and
// ValidateDefinition warns when "title" or "download" is missing.
Fields FieldsListBlock `yaml:"fields"`
}
// RowsBlock describes how to find result rows in HTML.
// Decoding goes through (*RowsBlock).UnmarshalYAML, which copies these four
// fields from the YAML node.
type RowsBlock struct {
// Selector locates the result rows; ValidateDefinition warns when it is empty.
Selector string `yaml:"selector"`
Remove string `yaml:"remove"`
// After is an integer read from the "after" key.
// NOTE(review): meaning not visible in this file — confirm with the consumer.
After int `yaml:"after"`
DateHeaders SelectorBlock `yaml:"dateheaders"`
}
// FieldBlock represents a single field extraction definition.
//
// Values are built by FieldsListBlock.UnmarshalYAML, which sets Field from a
// mapping key and Block from the corresponding mapping value.
type FieldBlock struct {
// Field is the field name (the key in the YAML "fields" mapping).
Field string `yaml:"field"`
// Block is excluded from struct-tag decoding (yaml:"-").
Block SelectorBlock `yaml:"-"`
}
// SelectorBlock describes CSS selector extraction with optional filters.
// All fields are read verbatim from YAML; this file does not interpret them.
type SelectorBlock struct {
Selector string `yaml:"selector"`
// NOTE(review): Text is presumably a literal/template value used instead of
// a selector — confirm against the extraction code.
Text string `yaml:"text"`
Attribute string `yaml:"attribute"`
Remove string `yaml:"remove"`
// Filters lists the filter transformations declared for this block.
Filters []FilterBlock `yaml:"filters"`
// Case is a string-to-string mapping read from the "case" key.
Case map[string]string `yaml:"case"`
}
// FilterBlock represents a filter transformation.
type FilterBlock struct {
// Name is the filter's name as written in the YAML.
Name string `yaml:"name"`
// Args is left untyped, so it holds whatever YAML value appears under
// "args" (scalar, list, or mapping) as decoded by the yaml package.
Args interface{} `yaml:"args"`
}
// RatioBlock describes ratio display configuration.
// Both fields are plain strings read from the YAML and are not interpreted
// in this file.
type RatioBlock struct {
Selector string `yaml:"selector"`
Path string `yaml:"path"`
}
// StringOrSlice is a list of strings whose YAML form may be either a single
// string or a sequence of strings.
type StringOrSlice []string

// UnmarshalYAML implements yaml.Unmarshaler.
//
// The node is first decoded as a single string; on success the receiver
// becomes a one-element list. Otherwise it is decoded as a list of strings,
// and a failure of that second attempt is returned wrapped with a message
// describing the accepted shapes.
func (s *StringOrSlice) UnmarshalYAML(value *yaml.Node) error {
	var one string
	if value.Decode(&one) == nil {
		*s = StringOrSlice{one}
		return nil
	}

	var many []string
	err := value.Decode(&many)
	if err != nil {
		return fmt.Errorf("expected string or list of strings: %w", err)
	}
	*s = many
	return nil
}
// FieldsListBlock preserves the field ordering from YAML map keys.
type FieldsListBlock []FieldBlock

// UnmarshalYAML implements yaml.Unmarshaler.
//
// Cardigann fields are a YAML mapping where each key is a field name and each
// value is a selector block. The mapping's yaml.Node is walked directly so the
// resulting slice keeps the key order of the document.
//
// It returns an error when the node is not a mapping or when any value cannot
// be decoded into a SelectorBlock; on error the receiver is left unchanged.
func (f *FieldsListBlock) UnmarshalYAML(value *yaml.Node) error {
	if value.Kind != yaml.MappingNode {
		return fmt.Errorf("fields must be a mapping")
	}

	result := make([]FieldBlock, 0, len(value.Content)/2)
	// A mapping node stores its entries as alternating key and value nodes.
	// The i+1 bound keeps the loop safe for a hand-built node with an odd
	// number of children.
	for i := 0; i+1 < len(value.Content); i += 2 {
		fieldName := value.Content[i].Value

		// Decode the value node in place rather than marshalling it back to
		// text and re-parsing: that round trip is wasted work, and it loses
		// the document context, so a value that is (or contains) an alias to
		// an anchor defined elsewhere could not be resolved.
		var block SelectorBlock
		if err := value.Content[i+1].Decode(&block); err != nil {
			return fmt.Errorf("failed to unmarshal field %q block: %w", fieldName, err)
		}

		result = append(result, FieldBlock{
			Field: fieldName,
			Block: block,
		})
	}

	*f = result
	return nil
}
// UnmarshalYAML implements yaml.Unmarshaler for RowsBlock.
//
// The node is decoded into a locally defined type that has RowsBlock's fields
// and tags but none of its methods, so decoding does not call back into this
// method. The decoded value is then copied into the receiver. No defaults are
// applied; on a decode error the receiver is left unchanged.
func (r *RowsBlock) UnmarshalYAML(value *yaml.Node) error {
	type plainRows RowsBlock

	var decoded plainRows
	if err := value.Decode(&decoded); err != nil {
		return err
	}

	*r = RowsBlock(decoded)
	return nil
}
// UnmarshalYAML implements yaml.Unmarshaler for LoginBlock and applies
// defaults: an empty Method becomes "form" and an empty Form becomes "form".
//
// The node is decoded into a locally defined type that has LoginBlock's
// fields and tags but none of its methods, so decoding does not call back
// into this method. On a decode error the receiver is left unchanged.
func (l *LoginBlock) UnmarshalYAML(value *yaml.Node) error {
	type plainLogin LoginBlock

	var decoded plainLogin
	if err := value.Decode(&decoded); err != nil {
		return err
	}

	// Fill in defaults before publishing the result to the receiver.
	if decoded.Method == "" {
		decoded.Method = "form"
	}
	if decoded.Form == "" {
		decoded.Form = "form"
	}

	*l = LoginBlock(decoded)
	return nil
}
// ParseDefinition decodes raw YAML bytes into a Definition.
//
// After decoding it defaults Language to "en-us" and Encoding to "UTF-8" when
// they are empty, then validates the result. An error is returned when:
//   - the YAML cannot be decoded (the underlying error is wrapped);
//   - site, name, or links is missing (checked in that order);
//   - there are more than 100 search fields or more than 1000 category
//     mappings (threat model T-10-04: reject oversized definitions).
//
// On any error the returned *Definition is nil.
func ParseDefinition(data []byte) (*Definition, error) {
	parsed := new(Definition)
	if err := yaml.Unmarshal(data, parsed); err != nil {
		return nil, fmt.Errorf("parse YAML: %w", err)
	}

	// Defaults for optional top-level keys.
	if parsed.Language == "" {
		parsed.Language = "en-us"
	}
	if parsed.Encoding == "" {
		parsed.Encoding = "UTF-8"
	}

	// Required fields; cases are evaluated top to bottom, so the first
	// missing field is the one reported.
	switch {
	case parsed.Site == "":
		return nil, fmt.Errorf("definition missing required field: site")
	case parsed.Name == "":
		return nil, fmt.Errorf("definition missing required field: name")
	case len(parsed.Links) == 0:
		return nil, fmt.Errorf("definition missing required field: links")
	}

	// Threat model T-10-04: reject oversized definitions.
	if n := len(parsed.Search.Fields); n > 100 {
		return nil, fmt.Errorf("definition has too many search fields (%d > 100)", n)
	}
	if n := len(parsed.Caps.Categories); n > 1000 {
		return nil, fmt.Errorf("definition has too many category mappings (%d > 1000)", n)
	}

	return parsed, nil
}
// ValidateDefinition returns a list of validation warnings for a parsed definition.
// These are not errors — the definition may still be usable — but indicate potential issues.
//
// Warnings are produced when:
//   - search.rows.selector is empty;
//   - search.fields has no "title" entry;
//   - search.fields has no "download" entry.
//
// The result is nil when there is nothing to warn about. def must be non-nil.
func ValidateDefinition(def *Definition) []string {
	var warnings []string

	if def.Search.Rows.Selector == "" {
		warnings = append(warnings, "search.rows.selector is empty — search will not find results")
	}

	var hasTitle, hasDownload bool
	// Index into the slice rather than ranging by value so each FieldBlock
	// (which embeds a whole SelectorBlock) is not copied just to read its name.
	for i := range def.Search.Fields {
		switch def.Search.Fields[i].Field {
		case "title":
			hasTitle = true
		case "download":
			hasDownload = true
		}
	}
	if !hasTitle {
		warnings = append(warnings, "search.fields missing \"title\" field — results will have no title")
	}
	if !hasDownload {
		warnings = append(warnings, "search.fields missing \"download\" field — results will have no download URL")
	}

	// TODO: cross-check def.Login.Inputs against def.Settings. An earlier
	// version built a set of setting names here but never consulted it, so no
	// warning was ever emitted; the unused work has been removed until the
	// check is actually specified and implemented.

	return warnings
}