288 lines
8.0 KiB
Go
288 lines
8.0 KiB
Go
package cardigann
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
yaml "gopkg.in/yaml.v3"
|
|
)
|
|
|
|
// Definition represents a parsed Cardigann YAML indexer definition.
|
|
// It matches the upstream Cardigann schema for site definitions.
|
|
type Definition struct {
|
|
Site string `yaml:"site"`
|
|
Name string `yaml:"name"`
|
|
Description string `yaml:"description"`
|
|
Language string `yaml:"language"`
|
|
Encoding string `yaml:"encoding"`
|
|
Links StringOrSlice `yaml:"links"`
|
|
Settings []SettingsField `yaml:"settings"`
|
|
Caps CapabilitiesBlock `yaml:"caps"`
|
|
Login LoginBlock `yaml:"login"`
|
|
Ratio RatioBlock `yaml:"ratio"`
|
|
Search SearchBlock `yaml:"search"`
|
|
}
|
|
|
|
// SettingsField is one user-configurable field declared in a definition's
// "settings" list.
type SettingsField struct {
	// Name comes from the "name" key of the entry.
	Name string `yaml:"name"`

	// Type comes from the "type" key of the entry.
	Type string `yaml:"type"`

	// Label comes from the "label" key of the entry.
	Label string `yaml:"label"`
}
|
|
|
|
// CapabilitiesBlock holds the "caps" section of a definition: its category
// mappings and its search modes.
type CapabilitiesBlock struct {
	// Categories comes from the "categories" key, a string-to-string mapping.
	Categories map[string]string `yaml:"categories"`

	// Modes comes from the "modes" key, a mapping from a string to a list of
	// strings.
	Modes map[string][]string `yaml:"modes"`
}
|
|
|
|
// LoginBlock describes authentication configuration.
|
|
type LoginBlock struct {
|
|
Path string `yaml:"path"`
|
|
Method string `yaml:"method"`
|
|
Form string `yaml:"form"`
|
|
Inputs map[string]string `yaml:"inputs"`
|
|
Error []ErrorBlock `yaml:"error"`
|
|
Test PageTestBlock `yaml:"test"`
|
|
}
|
|
|
|
// ErrorBlock describes an error detection pattern.
|
|
type ErrorBlock struct {
|
|
Path string `yaml:"path"`
|
|
Selector string `yaml:"selector"`
|
|
Message SelectorBlock `yaml:"message"`
|
|
}
|
|
|
|
// PageTestBlock is a page test used to verify that a login succeeded.
type PageTestBlock struct {
	// Path comes from the "path" key.
	Path string `yaml:"path"`

	// Selector comes from the "selector" key.
	Selector string `yaml:"selector"`
}
|
|
|
|
// SearchBlock describes search configuration.
|
|
type SearchBlock struct {
|
|
Path string `yaml:"path"`
|
|
Method string `yaml:"method"`
|
|
Inputs map[string]string `yaml:"inputs"`
|
|
Rows RowsBlock `yaml:"rows"`
|
|
Fields FieldsListBlock `yaml:"fields"`
|
|
}
|
|
|
|
// RowsBlock describes how to find result rows in HTML.
|
|
type RowsBlock struct {
|
|
Selector string `yaml:"selector"`
|
|
Remove string `yaml:"remove"`
|
|
After int `yaml:"after"`
|
|
DateHeaders SelectorBlock `yaml:"dateheaders"`
|
|
}
|
|
|
|
// FieldBlock represents a single field extraction definition.
|
|
type FieldBlock struct {
|
|
Field string `yaml:"field"`
|
|
Block SelectorBlock `yaml:"-"`
|
|
}
|
|
|
|
// SelectorBlock describes CSS selector extraction with optional filters.
|
|
type SelectorBlock struct {
|
|
Selector string `yaml:"selector"`
|
|
Text string `yaml:"text"`
|
|
Attribute string `yaml:"attribute"`
|
|
Remove string `yaml:"remove"`
|
|
Filters []FilterBlock `yaml:"filters"`
|
|
Case map[string]string `yaml:"case"`
|
|
}
|
|
|
|
// FilterBlock is one filter transformation in a selector block's "filters"
// list.
type FilterBlock struct {
	// Name comes from the "name" key.
	Name string `yaml:"name"`

	// Args comes from the "args" key. It is untyped, so it holds whatever
	// shape the YAML decoder produces for the value that was written.
	Args interface{} `yaml:"args"`
}
|
|
|
|
// RatioBlock holds the "ratio" section of a definition, which configures how
// the ratio is displayed.
type RatioBlock struct {
	// Selector comes from the "selector" key.
	Selector string `yaml:"selector"`

	// Path comes from the "path" key.
	Path string `yaml:"path"`
}
|
|
|
|
// StringOrSlice is a custom type that accepts either a string or a slice of strings in YAML.
|
|
type StringOrSlice []string
|
|
|
|
func (s *StringOrSlice) UnmarshalYAML(value *yaml.Node) error {
|
|
var single string
|
|
if err := value.Decode(&single); err == nil {
|
|
*s = []string{single}
|
|
return nil
|
|
}
|
|
var slice []string
|
|
if err := value.Decode(&slice); err != nil {
|
|
return fmt.Errorf("expected string or list of strings: %w", err)
|
|
}
|
|
*s = slice
|
|
return nil
|
|
}
|
|
|
|
// FieldsListBlock preserves the field ordering from YAML map keys.
|
|
type FieldsListBlock []FieldBlock
|
|
|
|
func (f *FieldsListBlock) UnmarshalYAML(value *yaml.Node) error {
|
|
// Cardigann fields are a YAML map where key is field name and value is selector block.
|
|
// We use the yaml.Node directly to preserve key ordering.
|
|
if value.Kind != yaml.MappingNode {
|
|
return fmt.Errorf("fields must be a mapping")
|
|
}
|
|
|
|
result := make([]FieldBlock, 0, len(value.Content)/2)
|
|
for i := 0; i < len(value.Content); i += 2 {
|
|
keyNode := value.Content[i]
|
|
valNode := value.Content[i+1]
|
|
|
|
fieldName := keyNode.Value
|
|
|
|
// Marshal the value node back to YAML, then unmarshal into SelectorBlock
|
|
valueBytes, err := yaml.Marshal(valNode)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal field %q: %w", fieldName, err)
|
|
}
|
|
|
|
var block SelectorBlock
|
|
if err := yaml.Unmarshal(valueBytes, &block); err != nil {
|
|
return fmt.Errorf("failed to unmarshal field %q block: %w", fieldName, err)
|
|
}
|
|
|
|
result = append(result, FieldBlock{
|
|
Field: fieldName,
|
|
Block: block,
|
|
})
|
|
}
|
|
|
|
*f = result
|
|
return nil
|
|
}
|
|
|
|
// UnmarshalYAML sets default values for RowsBlock.
|
|
func (r *RowsBlock) UnmarshalYAML(value *yaml.Node) error {
|
|
// Use a raw type to avoid infinite recursion
|
|
type rawRows struct {
|
|
Selector string `yaml:"selector"`
|
|
Remove string `yaml:"remove"`
|
|
After int `yaml:"after"`
|
|
DateHeaders SelectorBlock `yaml:"dateheaders"`
|
|
}
|
|
var raw rawRows
|
|
if err := value.Decode(&raw); err != nil {
|
|
return err
|
|
}
|
|
r.Selector = raw.Selector
|
|
r.Remove = raw.Remove
|
|
r.After = raw.After
|
|
r.DateHeaders = raw.DateHeaders
|
|
return nil
|
|
}
|
|
|
|
// UnmarshalYAML sets default values for LoginBlock.
|
|
func (l *LoginBlock) UnmarshalYAML(value *yaml.Node) error {
|
|
type rawLogin struct {
|
|
Path string `yaml:"path"`
|
|
Method string `yaml:"method"`
|
|
Form string `yaml:"form"`
|
|
Inputs map[string]string `yaml:"inputs"`
|
|
Error []ErrorBlock `yaml:"error"`
|
|
Test PageTestBlock `yaml:"test"`
|
|
}
|
|
var raw rawLogin
|
|
if err := value.Decode(&raw); err != nil {
|
|
return err
|
|
}
|
|
|
|
l.Path = raw.Path
|
|
l.Method = raw.Method
|
|
l.Form = raw.Form
|
|
l.Inputs = raw.Inputs
|
|
l.Error = raw.Error
|
|
l.Test = raw.Test
|
|
|
|
// Apply defaults
|
|
if l.Method == "" {
|
|
l.Method = "form"
|
|
}
|
|
if l.Form == "" {
|
|
l.Form = "form"
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// ParseDefinition parses raw YAML bytes into a Definition struct.
|
|
// It applies defaults and validates required fields.
|
|
func ParseDefinition(data []byte) (*Definition, error) {
|
|
var def Definition
|
|
if err := yaml.Unmarshal(data, &def); err != nil {
|
|
return nil, fmt.Errorf("parse YAML: %w", err)
|
|
}
|
|
|
|
// Apply defaults
|
|
if def.Language == "" {
|
|
def.Language = "en-us"
|
|
}
|
|
if def.Encoding == "" {
|
|
def.Encoding = "UTF-8"
|
|
}
|
|
|
|
// Validate required fields
|
|
if def.Site == "" {
|
|
return nil, fmt.Errorf("definition missing required field: site")
|
|
}
|
|
if def.Name == "" {
|
|
return nil, fmt.Errorf("definition missing required field: name")
|
|
}
|
|
if len(def.Links) == 0 {
|
|
return nil, fmt.Errorf("definition missing required field: links")
|
|
}
|
|
|
|
// Threat model T-10-04: Reject oversized definitions
|
|
if len(def.Search.Fields) > 100 {
|
|
return nil, fmt.Errorf("definition has too many search fields (%d > 100)", len(def.Search.Fields))
|
|
}
|
|
if len(def.Caps.Categories) > 1000 {
|
|
return nil, fmt.Errorf("definition has too many category mappings (%d > 1000)", len(def.Caps.Categories))
|
|
}
|
|
|
|
return &def, nil
|
|
}
|
|
|
|
// ValidateDefinition returns a list of validation warnings for a parsed definition.
|
|
// These are not errors — the definition may still be usable — but indicate potential issues.
|
|
func ValidateDefinition(def *Definition) []string {
|
|
var warnings []string
|
|
|
|
if def.Search.Rows.Selector == "" {
|
|
warnings = append(warnings, "search.rows.selector is empty — search will not find results")
|
|
}
|
|
|
|
hasTitle := false
|
|
hasDownload := false
|
|
for _, field := range def.Search.Fields {
|
|
switch field.Field {
|
|
case "title":
|
|
hasTitle = true
|
|
case "download":
|
|
hasDownload = true
|
|
}
|
|
}
|
|
|
|
if !hasTitle {
|
|
warnings = append(warnings, "search.fields missing \"title\" field — results will have no title")
|
|
}
|
|
if !hasDownload {
|
|
warnings = append(warnings, "search.fields missing \"download\" field — results will have no download URL")
|
|
}
|
|
|
|
// Check that login inputs reference config settings
|
|
if len(def.Login.Inputs) > 0 && len(def.Settings) > 0 {
|
|
settingNames := make(map[string]bool, len(def.Settings))
|
|
for _, s := range def.Settings {
|
|
settingNames[s.Name] = true
|
|
}
|
|
}
|
|
|
|
return warnings
|
|
}
|