feat: add homelab configuration and transcription support
Some checks failed
CI / Detect Change Scope (push) Has been cancelled
CI / Format & Lint (push) Has been cancelled
CI / Lint Strict Delta (push) Has been cancelled
CI / Test (push) Has been cancelled
CI / Build (Smoke) (push) Has been cancelled
CI / Docs-Only Fast Path (push) Has been cancelled
CI / Non-Rust Fast Path (push) Has been cancelled
CI / Docs Quality (push) Has been cancelled
CI / CI Required Gate (push) Has been cancelled
Docker / PR Docker Smoke (push) Has been cancelled
Docker / Build and Push Docker Image (push) Has been cancelled
Rust Package Security Audit / Security Audit (push) Has been cancelled
Rust Package Security Audit / License & Supply Chain (push) Has been cancelled
Some checks failed
CI / Detect Change Scope (push) Has been cancelled
CI / Format & Lint (push) Has been cancelled
CI / Lint Strict Delta (push) Has been cancelled
CI / Test (push) Has been cancelled
CI / Build (Smoke) (push) Has been cancelled
CI / Docs-Only Fast Path (push) Has been cancelled
CI / Non-Rust Fast Path (push) Has been cancelled
CI / Docs Quality (push) Has been cancelled
CI / CI Required Gate (push) Has been cancelled
Docker / PR Docker Smoke (push) Has been cancelled
Docker / Build and Push Docker Image (push) Has been cancelled
Rust Package Security Audit / Security Audit (push) Has been cancelled
Rust Package Security Audit / License & Supply Chain (push) Has been cancelled
- Add host-based delegate agents (ubuntu, grizzley, truenas, panda, pve)
- Add functional delegates (coder, reasoner, research, quick)
- Add NanoGPT provider with minimax-m2.5 model
- Add transcribe tool using faster-whisper
- Update TelegramChannel with workspace_dir
- Configure Z.AI as default provider with glm-5
This commit is contained in:
@@ -633,6 +633,7 @@ pub async fn doctor_channels(config: Config) -> Result<()> {
|
|||||||
Arc::new(TelegramChannel::new(
|
Arc::new(TelegramChannel::new(
|
||||||
tg.bot_token.clone(),
|
tg.bot_token.clone(),
|
||||||
tg.allowed_users.clone(),
|
tg.allowed_users.clone(),
|
||||||
|
Some(config.workspace_dir.clone()),
|
||||||
)),
|
)),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
@@ -923,6 +924,7 @@ pub async fn start_channels(config: Config) -> Result<()> {
|
|||||||
channels.push(Arc::new(TelegramChannel::new(
|
channels.push(Arc::new(TelegramChannel::new(
|
||||||
tg.bot_token.clone(),
|
tg.bot_token.clone(),
|
||||||
tg.allowed_users.clone(),
|
tg.allowed_users.clone(),
|
||||||
|
Some(config.workspace_dir.clone()),
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -179,18 +179,19 @@ fn parse_attachment_markers(message: &str) -> (String, Vec<TelegramAttachment>)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Telegram channel — long-polls the Bot API for updates
|
/// Telegram channel — long-polls the Bot API for updates
|
||||||
pub struct TelegramChannel {
|
pub struct TelegramChannel { workspace_dir: std::path::PathBuf,
|
||||||
bot_token: String,
|
bot_token: String,
|
||||||
allowed_users: Vec<String>,
|
allowed_users: Vec<String>,
|
||||||
client: reqwest::Client,
|
client: reqwest::Client,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TelegramChannel {
|
impl TelegramChannel {
|
||||||
pub fn new(bot_token: String, allowed_users: Vec<String>) -> Self {
|
pub fn new(bot_token: String, allowed_users: Vec<String>, workspace_dir: Option<std::path::PathBuf>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
bot_token,
|
bot_token,
|
||||||
allowed_users,
|
allowed_users,
|
||||||
client: reqwest::Client::new(),
|
client: reqwest::Client::new(),
|
||||||
|
workspace_dir: workspace_dir.unwrap_or_else(|| std::path::PathBuf::from("/tmp")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -231,7 +231,7 @@ async fn deliver_if_configured(config: &Config, job: &CronJob, output: &str) ->
|
|||||||
.telegram
|
.telegram
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.ok_or_else(|| anyhow::anyhow!("telegram channel not configured"))?;
|
.ok_or_else(|| anyhow::anyhow!("telegram channel not configured"))?;
|
||||||
let channel = TelegramChannel::new(tg.bot_token.clone(), tg.allowed_users.clone());
|
let channel = TelegramChannel::new(tg.bot_token.clone(), tg.allowed_users.clone(), Some(config.workspace_dir.clone()));
|
||||||
channel.send(output, target).await?;
|
channel.send(output, target).await?;
|
||||||
}
|
}
|
||||||
"discord" => {
|
"discord" => {
|
||||||
|
|||||||
@@ -135,6 +135,7 @@ fn resolve_provider_credential(name: &str, credential_override: Option<&str>) ->
|
|||||||
"zai" | "z.ai" => vec!["ZAI_API_KEY"],
|
"zai" | "z.ai" => vec!["ZAI_API_KEY"],
|
||||||
"nvidia" | "nvidia-nim" | "build.nvidia.com" => vec!["NVIDIA_API_KEY"],
|
"nvidia" | "nvidia-nim" | "build.nvidia.com" => vec!["NVIDIA_API_KEY"],
|
||||||
"synthetic" => vec!["SYNTHETIC_API_KEY"],
|
"synthetic" => vec!["SYNTHETIC_API_KEY"],
|
||||||
|
"nanogpt" | "nano-gpt" => vec!["NANO_GPT_API_KEY"],
|
||||||
"opencode" | "opencode-zen" => vec!["OPENCODE_API_KEY"],
|
"opencode" | "opencode-zen" => vec!["OPENCODE_API_KEY"],
|
||||||
"vercel" | "vercel-ai" => vec!["VERCEL_API_KEY"],
|
"vercel" | "vercel-ai" => vec!["VERCEL_API_KEY"],
|
||||||
"cloudflare" | "cloudflare-ai" => vec!["CLOUDFLARE_API_KEY"],
|
"cloudflare" | "cloudflare-ai" => vec!["CLOUDFLARE_API_KEY"],
|
||||||
@@ -246,6 +247,12 @@ pub fn create_provider_with_url(
|
|||||||
key,
|
key,
|
||||||
AuthStyle::Bearer,
|
AuthStyle::Bearer,
|
||||||
))),
|
))),
|
||||||
|
"nanogpt" | "nano-gpt" => Ok(Box::new(OpenAiCompatibleProvider::new(
|
||||||
|
"NanoGPT",
|
||||||
|
"https://nano-gpt.com/api/v1",
|
||||||
|
key,
|
||||||
|
AuthStyle::Bearer,
|
||||||
|
))),
|
||||||
"bedrock" | "aws-bedrock" => Ok(Box::new(OpenAiCompatibleProvider::new(
|
"bedrock" | "aws-bedrock" => Ok(Box::new(OpenAiCompatibleProvider::new(
|
||||||
"Amazon Bedrock",
|
"Amazon Bedrock",
|
||||||
"https://bedrock-runtime.us-east-1.amazonaws.com",
|
"https://bedrock-runtime.us-east-1.amazonaws.com",
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ pub mod schema;
|
|||||||
pub mod screenshot;
|
pub mod screenshot;
|
||||||
pub mod shell;
|
pub mod shell;
|
||||||
pub mod traits;
|
pub mod traits;
|
||||||
|
pub mod transcribe;
|
||||||
|
|
||||||
pub use browser::{BrowserTool, ComputerUseConfig};
|
pub use browser::{BrowserTool, ComputerUseConfig};
|
||||||
pub use browser_open::BrowserOpenTool;
|
pub use browser_open::BrowserOpenTool;
|
||||||
@@ -53,6 +54,7 @@ pub use schema::{CleaningStrategy, SchemaCleanr};
|
|||||||
pub use screenshot::ScreenshotTool;
|
pub use screenshot::ScreenshotTool;
|
||||||
pub use shell::ShellTool;
|
pub use shell::ShellTool;
|
||||||
pub use traits::Tool;
|
pub use traits::Tool;
|
||||||
|
pub use transcribe::TranscribeTool;
|
||||||
#[allow(unused_imports)]
|
#[allow(unused_imports)]
|
||||||
pub use traits::{ToolResult, ToolSpec};
|
pub use traits::{ToolResult, ToolSpec};
|
||||||
|
|
||||||
@@ -191,6 +193,8 @@ pub fn all_tools_with_runtime(
|
|||||||
tools.push(Box::new(ScreenshotTool::new(security.clone())));
|
tools.push(Box::new(ScreenshotTool::new(security.clone())));
|
||||||
tools.push(Box::new(ImageInfoTool::new(security.clone())));
|
tools.push(Box::new(ImageInfoTool::new(security.clone())));
|
||||||
|
|
||||||
|
tools.push(Box::new(TranscribeTool::new(security.clone(), None, None)));
|
||||||
|
|
||||||
if let Some(key) = composio_key {
|
if let Some(key) = composio_key {
|
||||||
if !key.is_empty() {
|
if !key.is_empty() {
|
||||||
tools.push(Box::new(ComposioTool::new(key, composio_entity_id)));
|
tools.push(Box::new(ComposioTool::new(key, composio_entity_id)));
|
||||||
|
|||||||
283
src/tools/transcribe.rs
Normal file
283
src/tools/transcribe.rs
Normal file
@@ -0,0 +1,283 @@
|
|||||||
|
use super::traits::{Tool, ToolResult};
|
||||||
|
use crate::security::SecurityPolicy;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use serde_json::json;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::process::Command;
|
||||||
|
|
||||||
|
/// Largest audio file, in bytes, that the tool will hand to the transcriber (100 MiB).
const MAX_AUDIO_BYTES: u64 = 100 * 1024 * 1024;

/// File extensions (lowercase, without the leading dot) accepted for transcription.
const SUPPORTED_FORMATS: &[&str] = &[
    "mp3", "wav", "m4a", "flac", "ogg", "webm", "mp4", "mpeg", "mpga",
];
|
||||||
|
|
||||||
|
pub struct TranscribeTool {
|
||||||
|
security: Arc<SecurityPolicy>,
|
||||||
|
model: String,
|
||||||
|
device: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TranscribeTool {
|
||||||
|
pub fn new(security: Arc<SecurityPolicy>, model: Option<String>, device: Option<String>) -> Self {
|
||||||
|
Self {
|
||||||
|
security,
|
||||||
|
model: model.unwrap_or_else(|| "base".to_string()),
|
||||||
|
device: device.unwrap_or_else(|| "cpu".to_string()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_supported_format(path: &Path) -> bool {
|
||||||
|
path.extension()
|
||||||
|
.and_then(|ext| ext.to_str())
|
||||||
|
.map(|ext| SUPPORTED_FORMATS.contains(&ext.to_lowercase().as_str()))
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn transcription_script() -> &'static str {
|
||||||
|
r#"
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
|
||||||
|
def transcribe(audio_path, model_size, device):
|
||||||
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
|
model = WhisperModel(model_size, device=device, compute_type="int8")
|
||||||
|
segments, info = model.transcribe(audio_path, beam_size=5)
|
||||||
|
|
||||||
|
transcription = []
|
||||||
|
for segment in segments:
|
||||||
|
transcription.append({
|
||||||
|
"start": round(segment.start, 2),
|
||||||
|
"end": round(segment.end, 2),
|
||||||
|
"text": segment.text.strip()
|
||||||
|
})
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"language": info.language,
|
||||||
|
"language_probability": round(info.language_probability, 2),
|
||||||
|
"duration": round(info.duration, 2),
|
||||||
|
"segments": transcription,
|
||||||
|
"text": " ".join(s["text"] for s in transcription)
|
||||||
|
}
|
||||||
|
|
||||||
|
print(json.dumps(result))
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
if len(sys.argv) != 4:
|
||||||
|
print(json.dumps({"error": "Usage: script.py <audio_path> <model> <device>"}))
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
transcribe(sys.argv[1], sys.argv[2], sys.argv[3])
|
||||||
|
"#
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Tool for TranscribeTool {
|
||||||
|
fn name(&self) -> &str {
|
||||||
|
"transcribe"
|
||||||
|
}
|
||||||
|
|
||||||
|
fn description(&self) -> &str {
|
||||||
|
"Transcribe audio files to text using faster-whisper. \
|
||||||
|
Supports mp3, wav, m4a, flac, ogg, webm, and other common audio formats. \
|
||||||
|
Returns the transcription with timestamps and detected language."
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parameters_schema(&self) -> serde_json::Value {
|
||||||
|
json!({
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"properties": {
|
||||||
|
"path": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Path to the audio file to transcribe"
|
||||||
|
},
|
||||||
|
"model": {
|
||||||
|
"type": "string",
|
||||||
|
"enum": ["tiny", "base", "small", "medium", "large-v2", "large-v3"],
|
||||||
|
"description": "Whisper model size (default: base). Larger models are more accurate but slower."
|
||||||
|
},
|
||||||
|
"language": {
|
||||||
|
"type": "string",
|
||||||
|
"description": "Hint for the spoken language (e.g., 'en', 'es', 'zh'). Optional."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["path"]
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn execute(&self, args: serde_json::Value) -> anyhow::Result<ToolResult> {
|
||||||
|
let path_str = args
|
||||||
|
.get("path")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("Missing 'path' parameter"))?;
|
||||||
|
|
||||||
|
if self.security.is_rate_limited() {
|
||||||
|
return Ok(ToolResult {
|
||||||
|
success: false,
|
||||||
|
output: String::new(),
|
||||||
|
error: Some("Rate limit exceeded: too many actions in the last hour".into()),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if !self.security.is_path_allowed(path_str) {
|
||||||
|
return Ok(ToolResult {
|
||||||
|
success: false,
|
||||||
|
output: String::new(),
|
||||||
|
error: Some(format!("Path not allowed by security policy: {}", path_str)),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if !self.security.record_action() {
|
||||||
|
return Ok(ToolResult {
|
||||||
|
success: false,
|
||||||
|
output: String::new(),
|
||||||
|
error: Some("Rate limit exceeded: action budget exhausted".into()),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let path = Path::new(path_str);
|
||||||
|
let full_path = self.security.workspace_dir.join(path);
|
||||||
|
|
||||||
|
if !full_path.exists() {
|
||||||
|
return Ok(ToolResult {
|
||||||
|
success: false,
|
||||||
|
output: String::new(),
|
||||||
|
error: Some(format!("File not found: {}", path_str)),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let metadata = match std::fs::metadata(&full_path) {
|
||||||
|
Ok(m) => m,
|
||||||
|
Err(e) => {
|
||||||
|
return Ok(ToolResult {
|
||||||
|
success: false,
|
||||||
|
output: String::new(),
|
||||||
|
error: Some(format!("Cannot read file metadata: {}", e)),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if metadata.len() > MAX_AUDIO_BYTES {
|
||||||
|
return Ok(ToolResult {
|
||||||
|
success: false,
|
||||||
|
output: String::new(),
|
||||||
|
error: Some(format!(
|
||||||
|
"File too large: {} bytes (max: {} bytes)",
|
||||||
|
metadata.len(),
|
||||||
|
MAX_AUDIO_BYTES
|
||||||
|
)),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if !Self::is_supported_format(&full_path) {
|
||||||
|
let ext = full_path
|
||||||
|
.extension()
|
||||||
|
.and_then(|e| e.to_str())
|
||||||
|
.unwrap_or("unknown");
|
||||||
|
return Ok(ToolResult {
|
||||||
|
success: false,
|
||||||
|
output: String::new(),
|
||||||
|
error: Some(format!(
|
||||||
|
"Unsupported audio format: {}. Supported: {}",
|
||||||
|
ext,
|
||||||
|
SUPPORTED_FORMATS.join(", ")
|
||||||
|
)),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let model = args
|
||||||
|
.get("model")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or(&self.model);
|
||||||
|
|
||||||
|
let script = Self::transcription_script();
|
||||||
|
let output = Command::new("python3")
|
||||||
|
.arg("-c")
|
||||||
|
.arg(script)
|
||||||
|
.arg(&full_path)
|
||||||
|
.arg(model)
|
||||||
|
.arg(&self.device)
|
||||||
|
.output()
|
||||||
|
.await;
|
||||||
|
|
||||||
|
match output {
|
||||||
|
Ok(result) => {
|
||||||
|
if result.status.success() {
|
||||||
|
let stdout = String::from_utf8_lossy(&result.stdout);
|
||||||
|
match serde_json::from_str::<serde_json::Value>(&stdout) {
|
||||||
|
Ok(json) => {
|
||||||
|
let text = json
|
||||||
|
.get("text")
|
||||||
|
.and_then(|t| t.as_str())
|
||||||
|
.unwrap_or(&stdout);
|
||||||
|
let language = json
|
||||||
|
.get("language")
|
||||||
|
.and_then(|l| l.as_str())
|
||||||
|
.unwrap_or("unknown");
|
||||||
|
let duration = json
|
||||||
|
.get("duration")
|
||||||
|
.and_then(|d| d.as_f64())
|
||||||
|
.unwrap_or(0.0);
|
||||||
|
|
||||||
|
let duration_f = duration;
|
||||||
|
let mut out = format!(
|
||||||
|
"**Transcription** ({:.1}, language: {})\n\n{}\n",
|
||||||
|
duration_f, language, text
|
||||||
|
);
|
||||||
|
|
||||||
|
if let Some(segments) = json.get("segments").and_then(|s| s.as_array())
|
||||||
|
{
|
||||||
|
if segments.len() > 1 {
|
||||||
|
out.push_str("\n**Segments:**\n");
|
||||||
|
for seg in segments.iter().take(20) {
|
||||||
|
if let (Some(start), Some(end), Some(seg_text)) = (
|
||||||
|
seg.get("start").and_then(|v| v.as_f64()),
|
||||||
|
seg.get("end").and_then(|v| v.as_f64()),
|
||||||
|
seg.get("text").and_then(|v| v.as_str()),
|
||||||
|
) {
|
||||||
|
let start_f = start;
|
||||||
|
let end_f = end;
|
||||||
|
out.push_str(&format!(
|
||||||
|
"[{:05.1} - {:05.1}] {}\n",
|
||||||
|
start_f, end_f, seg_text
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if segments.len() > 20 {
|
||||||
|
out.push_str(&format!(
|
||||||
|
"... and {} more segments\n",
|
||||||
|
segments.len() - 20
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(ToolResult {
|
||||||
|
success: true,
|
||||||
|
output: out,
|
||||||
|
error: None,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
Err(_) => Ok(ToolResult {
|
||||||
|
success: true,
|
||||||
|
output: stdout.to_string(),
|
||||||
|
error: None,
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let stderr = String::from_utf8_lossy(&result.stderr);
|
||||||
|
Ok(ToolResult {
|
||||||
|
success: false,
|
||||||
|
output: String::new(),
|
||||||
|
error: Some(format!("Transcription failed: {}", stderr.trim())),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => Ok(ToolResult {
|
||||||
|
success: false,
|
||||||
|
output: String::new(),
|
||||||
|
error: Some(format!("Failed to run transcription: {}", e)),
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user