615 lines
16 KiB
Go
615 lines
16 KiB
Go
package cardigann
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"net/http"
|
|
"net/url"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"github.com/dustin/go-humanize"
|
|
)
|
|
|
|
// CardigannResult describes a single row scraped from an indexer's search
// page. The service layer converts it to service.SearchResult.
type CardigannResult struct {
	// Title is the release name; GUID and DownloadURL are links taken from
	// the row.
	Title, GUID, DownloadURL string
	// Size is the release size in bytes (zero when it could not be parsed).
	Size int64
	// PubDate is the publication date as text.
	PubDate string
	// Seeders and Peers are the swarm counters reported by the indexer.
	Seeders, Peers int
	// Category and Description are free-form text copied from the row.
	Category, Description string
}
|
|
|
|
// IndexerTestResult reports the outcome of probing a Cardigann indexer.
type IndexerTestResult struct {
	// Success is true when every probe step passed.
	Success bool
	// Error holds a human-readable reason when Success is false.
	Error string
}
|
|
|
|
// CardigannEngine drives a Cardigann indexer definition: it logs in, runs
// searches and performs connectivity tests.
type CardigannEngine struct {
	// httpClient issues every outgoing request.
	httpClient *http.Client
	// cookies are collected during login and attached to later requests.
	cookies []*http.Cookie
	// logger receives warnings about non-fatal problems.
	logger *slog.Logger
}
|
|
|
|
// NewCardigannEngine creates a new CardigannEngine with safe HTTP client.
|
|
func NewCardigannEngine() *CardigannEngine {
|
|
return &CardigannEngine{
|
|
httpClient: SafeHTTPClient(),
|
|
logger: slog.Default(),
|
|
}
|
|
}
|
|
|
|
// Search executes a Cardigann search: login (if needed), build request, parse HTML, extract results.
|
|
func (e *CardigannEngine) Search(ctx context.Context, def *Definition, config map[string]string, query SearchQuery) ([]CardigannResult, error) {
|
|
baseURL := e.getBaseURL(def, config)
|
|
|
|
// Login if required
|
|
if def.Login.Path != "" || len(def.Login.Inputs) > 0 {
|
|
if err := e.login(ctx, def, config, baseURL); err != nil {
|
|
return nil, fmt.Errorf("login failed: %w", err)
|
|
}
|
|
}
|
|
|
|
// Build search URL from path template
|
|
searchPath := def.Search.Path
|
|
if searchPath == "" {
|
|
searchPath = "/"
|
|
}
|
|
|
|
path, err := ApplyTemplate("search-path", searchPath, TemplateContext{
|
|
Query: query,
|
|
Config: config,
|
|
Categories: []string{},
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("template search path: %w", err)
|
|
}
|
|
|
|
searchURL, err := e.resolvePath(baseURL, path)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("resolve search URL: %w", err)
|
|
}
|
|
|
|
// Validate the search URL (SSRF protection)
|
|
if err := ValidateURL(searchURL); err != nil {
|
|
return nil, fmt.Errorf("search URL blocked: %w", err)
|
|
}
|
|
|
|
// Build query inputs
|
|
inputValues := make(url.Values)
|
|
for key, tplStr := range def.Search.Inputs {
|
|
rendered, err := ApplyTemplate("input-"+key, tplStr, TemplateContext{
|
|
Query: query,
|
|
Config: config,
|
|
Categories: []string{},
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("template input %q: %w", key, err)
|
|
}
|
|
|
|
if key == "$raw" {
|
|
// Parse as query string and merge
|
|
parsed, err := url.ParseQuery(rendered)
|
|
if err == nil {
|
|
for k, vals := range parsed {
|
|
for _, v := range vals {
|
|
inputValues.Set(k, v)
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
inputValues.Set(key, rendered)
|
|
}
|
|
}
|
|
|
|
// Execute HTTP request
|
|
var resp *http.Response
|
|
method := strings.ToUpper(def.Search.Method)
|
|
if method == "" {
|
|
method = "GET"
|
|
}
|
|
|
|
searchCtx, searchCancel := context.WithTimeout(ctx, 15*time.Second)
|
|
defer searchCancel()
|
|
|
|
if method == "POST" {
|
|
req, err := http.NewRequestWithContext(searchCtx, http.MethodPost, searchURL, strings.NewReader(inputValues.Encode()))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create POST request: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
|
for _, cookie := range e.cookies {
|
|
req.AddCookie(cookie)
|
|
}
|
|
resp, err = e.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("POST search: %w", err)
|
|
}
|
|
} else {
|
|
// GET: append query string
|
|
if len(inputValues) > 0 {
|
|
if strings.Contains(searchURL, "?") {
|
|
searchURL += "&" + inputValues.Encode()
|
|
} else {
|
|
searchURL += "?" + inputValues.Encode()
|
|
}
|
|
}
|
|
req, err := http.NewRequestWithContext(searchCtx, http.MethodGet, searchURL, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("create GET request: %w", err)
|
|
}
|
|
for _, cookie := range e.cookies {
|
|
req.AddCookie(cookie)
|
|
}
|
|
resp, err = e.httpClient.Do(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("GET search: %w", err)
|
|
}
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode >= 400 {
|
|
return nil, fmt.Errorf("search returned HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
// Read response with size limit (T-10-07: 10MB cap)
|
|
body := io.LimitReader(resp.Body, 10*1024*1024)
|
|
|
|
// Parse HTML
|
|
doc, err := goquery.NewDocumentFromReader(body)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("parse HTML: %w", err)
|
|
}
|
|
|
|
// Find rows
|
|
rows := doc.Find(def.Search.Rows.Selector)
|
|
if def.Search.Rows.Remove != "" {
|
|
rows.Find(def.Search.Rows.Remove).Remove()
|
|
}
|
|
|
|
var results []CardigannResult
|
|
rows.Each(func(i int, row *goquery.Selection) {
|
|
result := CardigannResult{}
|
|
fieldValues := make(map[string]string)
|
|
|
|
for _, field := range def.Search.Fields {
|
|
val, err := ExtractField(row, field.Block)
|
|
if err != nil {
|
|
e.logger.Warn("field extraction error", "field", field.Field, "error", err)
|
|
continue
|
|
}
|
|
fieldValues[field.Field] = val
|
|
}
|
|
|
|
// Map fields to result
|
|
result.Title = fieldValues["title"]
|
|
result.DownloadURL = fieldValues["download"]
|
|
result.GUID = fieldValues["details"]
|
|
result.Category = fieldValues["category"]
|
|
result.Description = fieldValues["description"]
|
|
result.PubDate = fieldValues["date"]
|
|
|
|
// Resolve relative URLs
|
|
if result.DownloadURL != "" {
|
|
resolved, err := e.resolvePath(baseURL, result.DownloadURL)
|
|
if err == nil {
|
|
result.DownloadURL = resolved
|
|
}
|
|
}
|
|
if result.GUID != "" {
|
|
resolved, err := e.resolvePath(baseURL, result.GUID)
|
|
if err == nil {
|
|
result.GUID = resolved
|
|
}
|
|
}
|
|
|
|
// Parse size
|
|
if sizeStr := fieldValues["size"]; sizeStr != "" {
|
|
if size, err := humanize.ParseBytes(strings.TrimSpace(sizeStr)); err == nil {
|
|
result.Size = int64(size)
|
|
}
|
|
}
|
|
|
|
// Parse seeders/peers
|
|
if seedersStr := fieldValues["seeders"]; seedersStr != "" {
|
|
if v, err := strconv.Atoi(strings.TrimSpace(seedersStr)); err == nil {
|
|
result.Seeders = v
|
|
}
|
|
}
|
|
if leechersStr := fieldValues["leechers"]; leechersStr != "" {
|
|
if v, err := strconv.Atoi(strings.TrimSpace(leechersStr)); err == nil {
|
|
result.Peers = v
|
|
}
|
|
}
|
|
|
|
// Parse date if it wasn't already RFC3339
|
|
if result.PubDate != "" {
|
|
result.PubDate = e.parseDateField(result.PubDate)
|
|
}
|
|
|
|
// Only include results with at least a title
|
|
if result.Title != "" {
|
|
results = append(results, result)
|
|
}
|
|
})
|
|
|
|
return results, nil
|
|
}
|
|
|
|
// login performs authentication against the Cardigann indexer.
|
|
func (e *CardigannEngine) login(ctx context.Context, def *Definition, config map[string]string, baseURL string) error {
|
|
loginPath := def.Login.Path
|
|
if loginPath == "" {
|
|
return fmt.Errorf("login path is empty")
|
|
}
|
|
|
|
path, err := ApplyTemplate("login-path", loginPath, TemplateContext{
|
|
Config: config,
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("template login path: %w", err)
|
|
}
|
|
|
|
loginURL, err := e.resolvePath(baseURL, path)
|
|
if err != nil {
|
|
return fmt.Errorf("resolve login URL: %w", err)
|
|
}
|
|
|
|
if err := ValidateURL(loginURL); err != nil {
|
|
return fmt.Errorf("login URL blocked: %w", err)
|
|
}
|
|
|
|
// Build input values from login.inputs
|
|
inputValues := make(map[string]string)
|
|
for key, tplStr := range def.Login.Inputs {
|
|
rendered, err := ApplyTemplate("login-input-"+key, tplStr, TemplateContext{
|
|
Config: config,
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("template login input %q: %w", key, err)
|
|
}
|
|
inputValues[key] = rendered
|
|
}
|
|
|
|
loginCtx, loginCancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer loginCancel()
|
|
|
|
switch def.Login.Method {
|
|
case "cookie":
|
|
// Set cookie directly
|
|
if cookieStr, ok := inputValues["cookie"]; ok {
|
|
parts := strings.SplitN(cookieStr, "=", 2)
|
|
cookie := &http.Cookie{
|
|
Name: parts[0],
|
|
Value: func() string { if len(parts) > 1 { return parts[1] }; return "" }(),
|
|
}
|
|
e.cookies = append(e.cookies, cookie)
|
|
}
|
|
return nil
|
|
|
|
case "post":
|
|
// POST directly to login path with inputs
|
|
form := url.Values{}
|
|
for key, val := range inputValues {
|
|
form.Set(key, val)
|
|
}
|
|
req, err := http.NewRequestWithContext(loginCtx, http.MethodPost, loginURL, strings.NewReader(form.Encode()))
|
|
if err != nil {
|
|
return fmt.Errorf("create login POST: %w", err)
|
|
}
|
|
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
|
resp, err := e.httpClient.Do(req)
|
|
if err != nil {
|
|
return fmt.Errorf("login POST: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024))
|
|
|
|
// Store cookies from response
|
|
e.cookies = resp.Cookies()
|
|
|
|
// Check for errors
|
|
if err := e.checkLoginErrors(resp, def); err != nil {
|
|
return err
|
|
}
|
|
|
|
default:
|
|
// "form" method (default)
|
|
// GET login page, find form, fill inputs, submit
|
|
req, err := http.NewRequestWithContext(loginCtx, http.MethodGet, loginURL, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("create login GET: %w", err)
|
|
}
|
|
resp, err := e.httpClient.Do(req)
|
|
if err != nil {
|
|
return fmt.Errorf("login GET: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
bodyBytes, err := io.ReadAll(io.LimitReader(resp.Body, 10*1024*1024))
|
|
if err != nil {
|
|
return fmt.Errorf("read login page: %w", err)
|
|
}
|
|
|
|
e.cookies = append(e.cookies, resp.Cookies()...)
|
|
|
|
// Parse the login page to find the form
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(bodyBytes)))
|
|
if err != nil {
|
|
return fmt.Errorf("parse login page: %w", err)
|
|
}
|
|
|
|
// Find the form
|
|
formSelector := def.Login.Form
|
|
if formSelector == "" {
|
|
formSelector = "form"
|
|
}
|
|
form := doc.Find(formSelector).First()
|
|
if form.Length() == 0 {
|
|
return fmt.Errorf("login form not found with selector %q", formSelector)
|
|
}
|
|
|
|
// Get form action
|
|
action, exists := form.Attr("action")
|
|
if !exists || action == "" {
|
|
action = loginPath
|
|
}
|
|
actionURL, err := e.resolvePath(baseURL, action)
|
|
if err != nil {
|
|
return fmt.Errorf("resolve form action: %w", err)
|
|
}
|
|
if err := ValidateURL(actionURL); err != nil {
|
|
return fmt.Errorf("form action URL blocked: %w", err)
|
|
}
|
|
|
|
// Collect hidden inputs from form
|
|
formValues := url.Values{}
|
|
form.Find("input[type='hidden']").Each(func(i int, s *goquery.Selection) {
|
|
name, _ := s.Attr("name")
|
|
value, _ := s.Attr("value")
|
|
if name != "" {
|
|
formValues.Set(name, value)
|
|
}
|
|
})
|
|
|
|
// Add login inputs
|
|
for key, val := range inputValues {
|
|
formValues.Set(key, val)
|
|
}
|
|
|
|
// Submit the form
|
|
submitReq, err := http.NewRequestWithContext(loginCtx, http.MethodPost, actionURL, strings.NewReader(formValues.Encode()))
|
|
if err != nil {
|
|
return fmt.Errorf("create form submit: %w", err)
|
|
}
|
|
submitReq.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
|
for _, cookie := range e.cookies {
|
|
submitReq.AddCookie(cookie)
|
|
}
|
|
|
|
submitResp, err := e.httpClient.Do(submitReq)
|
|
if err != nil {
|
|
return fmt.Errorf("submit login form: %w", err)
|
|
}
|
|
defer submitResp.Body.Close()
|
|
io.ReadAll(io.LimitReader(submitResp.Body, 10*1024*1024))
|
|
|
|
e.cookies = append(e.cookies, submitResp.Cookies()...)
|
|
|
|
// Check for errors
|
|
if err := e.checkLoginErrors(submitResp, def); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Test login if test block is defined
|
|
if def.Login.Test.Selector != "" || def.Login.Test.Path != "" {
|
|
testPath := def.Login.Test.Path
|
|
if testPath == "" {
|
|
testPath = "/"
|
|
}
|
|
testURL, err := e.resolvePath(baseURL, testPath)
|
|
if err != nil {
|
|
return fmt.Errorf("resolve test URL: %w", err)
|
|
}
|
|
if err := ValidateURL(testURL); err != nil {
|
|
return fmt.Errorf("test URL blocked: %w", err)
|
|
}
|
|
|
|
testReq, err := http.NewRequestWithContext(loginCtx, http.MethodGet, testURL, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("create test request: %w", err)
|
|
}
|
|
for _, cookie := range e.cookies {
|
|
testReq.AddCookie(cookie)
|
|
}
|
|
testResp, err := e.httpClient.Do(testReq)
|
|
if err != nil {
|
|
return fmt.Errorf("login test request: %w", err)
|
|
}
|
|
defer testResp.Body.Close()
|
|
io.ReadAll(io.LimitReader(testResp.Body, 10*1024*1024))
|
|
|
|
if def.Login.Test.Selector != "" {
|
|
testDoc, err := goquery.NewDocumentFromReader(strings.NewReader(func() string {
|
|
// We can't re-read the body, so we just check the status code
|
|
return ""
|
|
}()))
|
|
if err != nil {
|
|
return nil // Don't fail on parse errors
|
|
}
|
|
if testDoc.Find(def.Login.Test.Selector).Length() == 0 {
|
|
return fmt.Errorf("login test: selector %q not found", def.Login.Test.Selector)
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Test validates a Cardigann indexer by checking base URL connectivity and optionally testing login.
|
|
func (e *CardigannEngine) Test(ctx context.Context, def *Definition, config map[string]string) (*IndexerTestResult, error) {
|
|
baseURL := e.getBaseURL(def, config)
|
|
if baseURL == "" {
|
|
return &IndexerTestResult{Success: false, Error: "no base URL in definition"}, nil
|
|
}
|
|
|
|
if err := ValidateURL(baseURL); err != nil {
|
|
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("URL blocked: %v", err)}, nil
|
|
}
|
|
|
|
// If Login block present, attempt login
|
|
if def.Login.Path != "" || len(def.Login.Inputs) > 0 {
|
|
if err := e.login(ctx, def, config, baseURL); err != nil {
|
|
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("login failed: %v", err)}, nil
|
|
}
|
|
}
|
|
|
|
// If Search block present, test search path
|
|
if def.Search.Path != "" {
|
|
testPath, err := ApplyTemplate("test-path", def.Search.Path, TemplateContext{
|
|
Config: config,
|
|
})
|
|
if err != nil {
|
|
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("template error: %v", err)}, nil
|
|
}
|
|
|
|
testURL, err := e.resolvePath(baseURL, testPath)
|
|
if err != nil {
|
|
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("resolve URL: %v", err)}, nil
|
|
}
|
|
|
|
if err := ValidateURL(testURL); err != nil {
|
|
return &IndexerTestResult{Success: false, Error: fmt.Sprintf("URL blocked: %v", err)}, nil
|
|
}
|
|
|
|
testCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
|
|
defer cancel()
|
|
|
|
req, err := http.NewRequestWithContext(testCtx, http.MethodGet, testURL, nil)
|
|
if err != nil {
|
|
return &IndexerTestResult{Success: false, Error: err.Error()}, nil
|
|
}
|
|
for _, cookie := range e.cookies {
|
|
req.AddCookie(cookie)
|
|
}
|
|
|
|
resp, err := e.httpClient.Do(req)
|
|
if err != nil {
|
|
return &IndexerTestResult{Success: false, Error: err.Error()}, nil
|
|
}
|
|
resp.Body.Close()
|
|
|
|
if resp.StatusCode >= 400 {
|
|
return &IndexerTestResult{
|
|
Success: false,
|
|
Error: fmt.Sprintf("HTTP %d", resp.StatusCode),
|
|
}, nil
|
|
}
|
|
}
|
|
|
|
return &IndexerTestResult{Success: true}, nil
|
|
}
|
|
|
|
// resolvePath resolves a potentially relative path against a base URL.
|
|
func (e *CardigannEngine) resolvePath(baseURL, path string) (string, error) {
|
|
if path == "" {
|
|
return baseURL, nil
|
|
}
|
|
|
|
// Already absolute URL
|
|
if strings.HasPrefix(strings.ToLower(path), "http://") || strings.HasPrefix(strings.ToLower(path), "https://") {
|
|
return path, nil
|
|
}
|
|
|
|
// Relative URL — resolve against base
|
|
base, err := url.Parse(baseURL)
|
|
if err != nil {
|
|
return "", fmt.Errorf("parse base URL: %w", err)
|
|
}
|
|
|
|
ref, err := url.Parse(path)
|
|
if err != nil {
|
|
return "", fmt.Errorf("parse path: %w", err)
|
|
}
|
|
|
|
resolved := base.ResolveReference(ref)
|
|
return resolved.String(), nil
|
|
}
|
|
|
|
// getBaseURL returns the first link from the definition, or a config override.
|
|
func (e *CardigannEngine) getBaseURL(def *Definition, config map[string]string) string {
|
|
if url, ok := config["base_url"]; ok && url != "" {
|
|
return url
|
|
}
|
|
if len(def.Links) > 0 {
|
|
return def.Links[0]
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// parseDateField attempts to parse a date string in various formats.
|
|
func (e *CardigannEngine) parseDateField(val string) string {
|
|
// Already RFC3339
|
|
if _, err := time.Parse(time.RFC3339, val); err == nil {
|
|
return val
|
|
}
|
|
|
|
// Try common date layouts
|
|
layouts := []string{
|
|
"2006-01-02 15:04:05",
|
|
"2006-01-02T15:04:05Z07:00",
|
|
"2006-01-02T15:04:05",
|
|
"2006-01-02",
|
|
"02-Jan-2006",
|
|
"Jan 02, 2006",
|
|
"Jan 02 2006",
|
|
"02 Jan 2006 15:04:05",
|
|
"Mon, 02 Jan 2006 15:04:05 -0700",
|
|
time.RFC1123,
|
|
time.RFC1123Z,
|
|
time.RFC822,
|
|
time.RFC822Z,
|
|
}
|
|
|
|
for _, layout := range layouts {
|
|
if t, err := time.Parse(layout, strings.TrimSpace(val)); err == nil {
|
|
return t.Format(time.RFC3339)
|
|
}
|
|
}
|
|
|
|
// Try relative time
|
|
if t, err := parseFuzzyTime(val); err == nil {
|
|
return t.Format(time.RFC3339)
|
|
}
|
|
|
|
// Return as-is if we can't parse
|
|
return val
|
|
}
|
|
|
|
// checkLoginErrors checks for login error patterns in the response.
|
|
func (e *CardigannEngine) checkLoginErrors(resp *http.Response, def *Definition) error {
|
|
if len(def.Login.Error) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// Note: body has already been read; we'd need to store it
|
|
// For now, just check status code
|
|
if resp.StatusCode >= 400 {
|
|
return fmt.Errorf("login returned HTTP %d", resp.StatusCode)
|
|
}
|
|
|
|
return nil
|
|
}
|