package cardigann

import (
	"fmt"

	yaml "gopkg.in/yaml.v3"
)

// Definition represents a parsed Cardigann YAML indexer definition.
// It matches the upstream Cardigann schema for site definitions.
//
// ParseDefinition requires Site, Name, and at least one entry in Links, and
// fills in Language and Encoding when they are left empty.
type Definition struct {
	Site        string            `yaml:"site"`
	Name        string            `yaml:"name"`
	Description string            `yaml:"description"`
	Language    string            `yaml:"language"`
	Encoding    string            `yaml:"encoding"`
	Links       StringOrSlice     `yaml:"links"` // a single string or a list of strings in YAML
	Settings    []SettingsField   `yaml:"settings"`
	Caps        CapabilitiesBlock `yaml:"caps"`
	Login       LoginBlock        `yaml:"login"`
	Ratio       RatioBlock        `yaml:"ratio"`
	Search      SearchBlock       `yaml:"search"`
}

// SettingsField describes a user-configurable field in the definition.
type SettingsField struct {
	Name  string `yaml:"name"`
	Type  string `yaml:"type"`
	Label string `yaml:"label"`
}

// CapabilitiesBlock maps categories and search modes.
type CapabilitiesBlock struct {
	Categories map[string]string   `yaml:"categories"`
	Modes      map[string][]string `yaml:"modes"`
}

// LoginBlock describes authentication configuration.
// Its custom UnmarshalYAML defaults Method and Form to "form" when they are
// empty after decoding.
type LoginBlock struct {
	Path   string            `yaml:"path"`
	Method string            `yaml:"method"`
	Form   string            `yaml:"form"`
	Inputs map[string]string `yaml:"inputs"`
	Error  []ErrorBlock      `yaml:"error"`
	Test   PageTestBlock     `yaml:"test"`
}

// ErrorBlock describes an error detection pattern.
type ErrorBlock struct {
	Path     string        `yaml:"path"`
	Selector string        `yaml:"selector"`
	Message  SelectorBlock `yaml:"message"`
}

// PageTestBlock describes a page test for verifying login.
type PageTestBlock struct {
	Path     string `yaml:"path"`
	Selector string `yaml:"selector"`
}

// SearchBlock describes search configuration.
type SearchBlock struct {
	Path   string            `yaml:"path"`
	Method string            `yaml:"method"`
	Inputs map[string]string `yaml:"inputs"`
	Rows   RowsBlock         `yaml:"rows"`
	Fields FieldsListBlock   `yaml:"fields"` // ordered list built from the YAML mapping
}

// RowsBlock describes how to find result rows in HTML.
type RowsBlock struct { Selector string `yaml:"selector"` Remove string `yaml:"remove"` After int `yaml:"after"` DateHeaders SelectorBlock `yaml:"dateheaders"` } // FieldBlock represents a single field extraction definition. type FieldBlock struct { Field string `yaml:"field"` Block SelectorBlock `yaml:"-"` } // SelectorBlock describes CSS selector extraction with optional filters. type SelectorBlock struct { Selector string `yaml:"selector"` Text string `yaml:"text"` Attribute string `yaml:"attribute"` Remove string `yaml:"remove"` Filters []FilterBlock `yaml:"filters"` Case map[string]string `yaml:"case"` } // FilterBlock represents a filter transformation. type FilterBlock struct { Name string `yaml:"name"` Args interface{} `yaml:"args"` } // RatioBlock describes ratio display configuration. type RatioBlock struct { Selector string `yaml:"selector"` Path string `yaml:"path"` } // StringOrSlice is a custom type that accepts either a string or a slice of strings in YAML. type StringOrSlice []string func (s *StringOrSlice) UnmarshalYAML(value *yaml.Node) error { var single string if err := value.Decode(&single); err == nil { *s = []string{single} return nil } var slice []string if err := value.Decode(&slice); err != nil { return fmt.Errorf("expected string or list of strings: %w", err) } *s = slice return nil } // FieldsListBlock preserves the field ordering from YAML map keys. type FieldsListBlock []FieldBlock func (f *FieldsListBlock) UnmarshalYAML(value *yaml.Node) error { // Cardigann fields are a YAML map where key is field name and value is selector block. // We use the yaml.Node directly to preserve key ordering. 
if value.Kind != yaml.MappingNode { return fmt.Errorf("fields must be a mapping") } result := make([]FieldBlock, 0, len(value.Content)/2) for i := 0; i < len(value.Content); i += 2 { keyNode := value.Content[i] valNode := value.Content[i+1] fieldName := keyNode.Value // Marshal the value node back to YAML, then unmarshal into SelectorBlock valueBytes, err := yaml.Marshal(valNode) if err != nil { return fmt.Errorf("failed to marshal field %q: %w", fieldName, err) } var block SelectorBlock if err := yaml.Unmarshal(valueBytes, &block); err != nil { return fmt.Errorf("failed to unmarshal field %q block: %w", fieldName, err) } result = append(result, FieldBlock{ Field: fieldName, Block: block, }) } *f = result return nil } // UnmarshalYAML sets default values for RowsBlock. func (r *RowsBlock) UnmarshalYAML(value *yaml.Node) error { // Use a raw type to avoid infinite recursion type rawRows struct { Selector string `yaml:"selector"` Remove string `yaml:"remove"` After int `yaml:"after"` DateHeaders SelectorBlock `yaml:"dateheaders"` } var raw rawRows if err := value.Decode(&raw); err != nil { return err } r.Selector = raw.Selector r.Remove = raw.Remove r.After = raw.After r.DateHeaders = raw.DateHeaders return nil } // UnmarshalYAML sets default values for LoginBlock. func (l *LoginBlock) UnmarshalYAML(value *yaml.Node) error { type rawLogin struct { Path string `yaml:"path"` Method string `yaml:"method"` Form string `yaml:"form"` Inputs map[string]string `yaml:"inputs"` Error []ErrorBlock `yaml:"error"` Test PageTestBlock `yaml:"test"` } var raw rawLogin if err := value.Decode(&raw); err != nil { return err } l.Path = raw.Path l.Method = raw.Method l.Form = raw.Form l.Inputs = raw.Inputs l.Error = raw.Error l.Test = raw.Test // Apply defaults if l.Method == "" { l.Method = "form" } if l.Form == "" { l.Form = "form" } return nil } // ParseDefinition parses raw YAML bytes into a Definition struct. // It applies defaults and validates required fields. 
func ParseDefinition(data []byte) (*Definition, error) { var def Definition if err := yaml.Unmarshal(data, &def); err != nil { return nil, fmt.Errorf("parse YAML: %w", err) } // Apply defaults if def.Language == "" { def.Language = "en-us" } if def.Encoding == "" { def.Encoding = "UTF-8" } // Validate required fields if def.Site == "" { return nil, fmt.Errorf("definition missing required field: site") } if def.Name == "" { return nil, fmt.Errorf("definition missing required field: name") } if len(def.Links) == 0 { return nil, fmt.Errorf("definition missing required field: links") } // Threat model T-10-04: Reject oversized definitions if len(def.Search.Fields) > 100 { return nil, fmt.Errorf("definition has too many search fields (%d > 100)", len(def.Search.Fields)) } if len(def.Caps.Categories) > 1000 { return nil, fmt.Errorf("definition has too many category mappings (%d > 1000)", len(def.Caps.Categories)) } return &def, nil } // ValidateDefinition returns a list of validation warnings for a parsed definition. // These are not errors — the definition may still be usable — but indicate potential issues. func ValidateDefinition(def *Definition) []string { var warnings []string if def.Search.Rows.Selector == "" { warnings = append(warnings, "search.rows.selector is empty — search will not find results") } hasTitle := false hasDownload := false for _, field := range def.Search.Fields { switch field.Field { case "title": hasTitle = true case "download": hasDownload = true } } if !hasTitle { warnings = append(warnings, "search.fields missing \"title\" field — results will have no title") } if !hasDownload { warnings = append(warnings, "search.fields missing \"download\" field — results will have no download URL") } // Check that login inputs reference config settings if len(def.Login.Inputs) > 0 && len(def.Settings) > 0 { settingNames := make(map[string]bool, len(def.Settings)) for _, s := range def.Settings { settingNames[s.Name] = true } } return warnings }