add yt_dlp module + db

This commit is contained in:
JMARyA 2024-03-10 04:52:50 +01:00
parent 5941f61c8c
commit 1f32c21363
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
8 changed files with 447 additions and 159 deletions

6
Cargo.lock generated
View file

@ -214,6 +214,7 @@ version = "0.1.0"
dependencies = [
"chrono",
"env_logger",
"jobdispatcher",
"log",
"rusqlite",
"serde",
@ -266,6 +267,11 @@ version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c"
[[package]]
name = "jobdispatcher"
version = "0.1.0"
source = "git+https://git.hydrar.de/jmarya/jobdispatcher#df3bbb09ab2b2cace22d052e4a22370c88be9f2c"
[[package]]
name = "js-sys"
version = "0.3.69"

View file

@ -13,3 +13,4 @@ rusqlite = "0.30.0"
serde = { version = "1.0.196", features = ["derive"] }
serde_json = "1.0.113"
toml = "0.8.10"
jobdispatcher = { git = "https://git.hydrar.de/jmarya/jobdispatcher" }

View file

@ -12,4 +12,50 @@ output_format = "%(title)s [%(id)s].%(ext)s"
[youtube.channels]
# Channel Mappings
MentalOutlaw = "https://www.youtube.com/@MentalOutlaw"
MentalOutlaw = "https://www.youtube.com/@MentalOutlaw"
[[yt_dlp]]
# Module Name
name = "Custom-yt_dlp"
# Interval in minutes between checks
interval = 30
# Amount of items to query
limit = 10
# Format of the Thumbnail
thumbnail_format = "jpg"
# Output Template for yt-dlp
output_format = "%(title)s [%(id)s].%(ext)s"
# Download description
write_description = false
# Download info.json
write_info_json = false
# Download comments
write_comments = false
# Download thumbnail
write_thumbnail = true
# Download subtitles
write_subs = false
# Extract audio
audio_only = false
# Audio Format
audio_format = "m4a"
# Embed subtitles
embed_subs = false
# Embed thumbnail
embed_thumbnail = false
# Embed metadata
embed_metadata = true
# Embed chapters
embed_chapters = true
# Embed info.json
embed_info_json = true
# Split by chapter
split_chapters = false
# Format Selection
format = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio"
# Cookie File
cookie = "cookies.txt"
# Items to check
[yt_dlp.items]
Item = "url"

View file

@ -2,6 +2,8 @@ use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use crate::yt_dlp::YtDlpConfig;
/// General settings for hoard
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HoardConfig {
@ -16,4 +18,6 @@ pub struct GlobalConfig {
pub hoard: HoardConfig,
// Configuration for the YouTube Module
pub youtube: Option<crate::youtube::YouTubeConfig>,
// Custom instances of yt-dlp
pub yt_dlp: Option<Vec<YtDlpConfig>>,
}

View file

@ -1,16 +1,19 @@
use jobdispatcher::{JobDispatcher, JobOrder};
use rusqlite::Connection;
use std::error::Error;
use std::sync::{mpsc::Receiver, Arc};
// todo : make db singleton
#[derive(Debug, Clone)]
pub struct Database {
file: String,
pub struct DatabaseBackend {
pub file: String,
pub conn: Connection,
pub dispatcher: Arc<JobDispatcher<Query, Out>>,
pub recv: Receiver<JobOrder<Query, Out>>,
}
impl Database {
impl DatabaseBackend {
pub fn new(file: &str) -> Self {
let (dispatcher, recv) = jobdispatcher::JobDispatcher::<Query, Out>::new();
let conn = Connection::open(file).unwrap();
conn.execute(
"CREATE TABLE IF NOT EXISTS urls (
id INTEGER PRIMARY KEY,
@ -21,25 +24,74 @@ impl Database {
)
.unwrap();
let dispatcher = Arc::new(dispatcher);
Self {
file: file.to_string(),
conn,
dispatcher,
recv,
}
}
pub fn insert_url(&self, url: &str) -> Result<(), Box<dyn Error>> {
let conn = Connection::open(&self.file)?;
let timestamp = chrono::Local::now().to_rfc3339();
conn.execute(
"INSERT INTO urls (url, timestamp) VALUES (?, ?)",
[url, &timestamp],
)?;
Ok(())
pub fn take_db(&self) -> Database {
Database::new(self.dispatcher.clone())
}
pub fn check_for_url(&self, url: &str) -> Result<bool, Box<dyn Error>> {
let conn = Connection::open(&self.file)?;
let mut stmt = conn.prepare("SELECT COUNT(*) FROM urls WHERE url = ?")?;
let count: i64 = stmt.query_row([url], |row| row.get(0))?;
Ok(count > 0)
pub fn run(&self) {
while let Ok(job) = self.recv.recv() {
match job.param {
Query::InsertUrl(ref url) => {
let timestamp = chrono::Local::now().to_rfc3339();
self.conn
.execute(
"INSERT INTO urls (url, timestamp) VALUES (?, ?)",
[url, &timestamp],
)
.unwrap();
job.done(Out::Ok);
}
Query::CheckForUrl(ref url) => {
let conn = Connection::open(&self.file).unwrap();
let mut stmt = conn
.prepare("SELECT COUNT(*) FROM urls WHERE url = ?")
.unwrap();
let count: i64 = stmt.query_row([url], |row| row.get(0)).unwrap();
job.done(Out::Bool(count > 0));
}
}
}
}
}
/// Requests that a `Database` handle sends to the `DatabaseBackend`.
pub enum Query {
/// Record the given URL in the `urls` table.
InsertUrl(String),
/// Ask whether the given URL is already present in the `urls` table.
CheckForUrl(String),
}
/// Replies produced by the `DatabaseBackend` for a [`Query`].
pub enum Out {
/// The request was processed and carries no result value
/// (reply to `Query::InsertUrl`).
Ok,
/// Boolean result (reply to `Query::CheckForUrl`: `true` if the URL exists).
Bool(bool),
// Rows(Vec<String>),
}
/// Cloneable handle used by modules to talk to the database.
///
/// It holds no SQLite connection itself; every call is sent as a [`Query`]
/// through the shared job dispatcher and answered with an [`Out`].
#[derive(Clone)]
pub struct Database {
// Shared dispatcher through which queries are sent.
conn: Arc<JobDispatcher<Query, Out>>,
}
impl Database {
pub fn new(conn: Arc<JobDispatcher<Query, Out>>) -> Self {
Self { conn }
}
pub fn insert_url(&self, url: &str) {
self.conn.send(Query::InsertUrl(url.to_string()));
}
pub fn check_for_url(&self, url: &str) -> bool {
match self.conn.send(Query::CheckForUrl(url.to_string())) {
Out::Ok => false,
Out::Bool(b) => b,
}
}
}

View file

@ -3,9 +3,12 @@ use std::path::PathBuf;
mod config;
mod db;
mod youtube;
mod yt_dlp;
use config::GlobalConfig;
use crate::yt_dlp::YtDlpModule;
// todo : migrate to async code?
// todo : better log options
@ -38,18 +41,33 @@ fn main() {
log::info!("Starting hoard");
let db = db::Database::new("download.db");
let db = db::DatabaseBackend::new("download.db");
let config: GlobalConfig =
toml::from_str(&std::fs::read_to_string("config.toml").unwrap()).unwrap();
ensure_dir_exists(&config.hoard.data_dir);
let modules: Vec<Box<dyn Module>> = vec![Box::new(youtube::YouTubeModule::new(
let mut modules: Vec<Box<dyn Module>> = vec![Box::new(youtube::YouTubeModule::new(
config.youtube.unwrap(),
db,
db.take_db(),
config.hoard.data_dir.join("youtube"),
))];
for yt_dlp_mod in config.yt_dlp.unwrap_or_default() {
let mod_name = yt_dlp_mod
.name
.clone()
.unwrap_or_else(|| "yt_dlp".to_string());
modules.push(Box::new(YtDlpModule::new(
yt_dlp_mod,
db.take_db(),
config.hoard.data_dir.join(mod_name),
)));
}
let _db_thread = std::thread::spawn(move || {
db.run();
});
let threads: Vec<_> = modules
.into_iter()
.map(|x| {

View file

@ -1,13 +1,11 @@
use std::{
collections::HashMap,
io::{BufRead, BufReader},
path::PathBuf,
process::Command,
};
use std::{collections::HashMap, path::PathBuf};
use serde::{Deserialize, Serialize};
use crate::{ensure_dir_exists, Module};
use crate::{
yt_dlp::{YtDlpConfig, YtDlpModule},
Module,
};
/// Configuration for the `YouTube` Module
#[derive(Debug, Clone, Serialize, Deserialize)]
@ -22,30 +20,69 @@ pub struct YouTubeConfig {
thumbnail_format: Option<String>,
// Output Template for yt-dlp
output_format: Option<String>,
// Download description
pub write_description: Option<bool>,
// Download info.json
pub write_info_json: Option<bool>,
// Download comments
pub write_comments: Option<bool>,
// Download thumbnail
pub write_thumbnail: Option<bool>,
// Download subtitles
pub write_subs: Option<bool>,
// Embed subtitles
pub embed_subs: Option<bool>,
// Embed thumbnail
pub embed_thumbnail: Option<bool>,
// Embed metadata
pub embed_metadata: Option<bool>,
// Embed chapters
embed_chapters: Option<bool>,
// Embed info.json
pub embed_info_json: Option<bool>,
// Split by chapter
pub split_chapters: Option<bool>,
// Format Selection
pub format: Option<String>,
// Cookie File
pub cookie: Option<String>,
}
impl YouTubeConfig {
pub fn download_options(&self) -> DownloadOptions {
DownloadOptions {
thumbnail_format: self.thumbnail_format.clone(),
output_format: self.output_format.clone(),
}
}
}
#[derive(Clone, Debug)]
#[derive(Clone)]
pub struct YouTubeModule {
config: YouTubeConfig,
db: crate::db::Database,
root_dir: PathBuf,
yt_dlp: YtDlpModule,
}
impl YouTubeModule {
pub const fn new(config: YouTubeConfig, db: crate::db::Database, root_dir: PathBuf) -> Self {
pub fn new(config: YouTubeConfig, db: crate::db::Database, root_dir: PathBuf) -> Self {
Self {
config,
db,
root_dir,
yt_dlp: YtDlpModule::new(
YtDlpConfig {
name: Some("youtube".to_string()),
interval: config.interval,
limit: config.limit,
items: config.channels,
thumbnail_format: config.thumbnail_format,
output_format: config.output_format.clone(),
write_description: Some(config.write_description.unwrap_or(true)),
write_info_json: config.write_info_json,
write_comments: config.write_comments,
write_thumbnail: Some(config.write_thumbnail.unwrap_or(true)),
write_subs: config.write_subs,
audio_format: None,
embed_subs: config.embed_subs,
embed_thumbnail: config.embed_thumbnail,
embed_metadata: config.embed_metadata,
embed_chapters: config.embed_chapters,
embed_info_json: config.embed_info_json,
split_chapters: config.split_chapters,
format: config.format,
cookie: config.cookie,
audio_only: Some(false),
},
db,
root_dir,
),
}
}
}
@ -56,115 +93,6 @@ impl Module for YouTubeModule {
}
fn run(&self) {
loop {
log::info!("Running YouTube Module");
let download_options = self.config.download_options();
log::info!("Checking {} channels", self.config.channels.len());
for (channel, channel_url) in &self.config.channels {
log::info!("Fetching \"{channel}\" videos");
match Self::get_latest_channel_videos(channel_url, self.config.limit.unwrap_or(10))
{
Ok(latest_videos) => {
for (video_title, video_url) in latest_videos {
if self.db.check_for_url(&video_url).unwrap() {
log::trace!(
"Skipping \"{video_title}\" because it was already downloaded"
);
} else {
match Self::download_video(
&video_url,
&self.root_dir.join(channel),
&download_options,
) {
Ok(()) => {
// mark as downloaded
self.db.insert_url(&video_url).unwrap();
log::info!("Downloaded \"{video_title}\"");
}
Err(e) => {
log::error!(
"Error downloading \"{video_title}\"; Reason: {e}"
);
}
}
}
}
}
Err(e) => {
log::error!("Could not get videos from \"{channel}\". Reason: {e}");
}
}
}
log::info!(
"Check complete. Sleeping for {} minutes...",
self.config.interval
);
std::thread::sleep(std::time::Duration::from_secs(self.config.interval * 60));
}
self.yt_dlp.run();
}
}
impl YouTubeModule {
fn get_latest_channel_videos(
channel: &str,
limit: u64,
) -> Result<Vec<(String, String)>, String> {
let output = Command::new("yt-dlp")
.arg("--no-warnings")
.arg("--flat-playlist")
.arg("--skip-download")
.arg("--print")
.arg("title,webpage_url")
.arg("--playlist-end")
.arg(limit.to_string())
.arg(channel)
.output()
.expect("Failed to execute yt-dlp");
if !output.status.success() {
return Err(String::from_utf8(output.stderr).unwrap());
}
let reader = BufReader::new(&output.stdout[..]);
let mut videos = Vec::new();
let mut lines = reader.lines();
while let (Some(title), Some(url)) = (lines.next(), lines.next()) {
if let (Ok(title), Ok(url)) = (title, url) {
videos.push((title, url));
}
}
Ok(videos.into_iter().take(limit as usize).collect())
}
fn download_video(video_url: &str, cwd: &PathBuf, opt: &DownloadOptions) -> Result<(), String> {
ensure_dir_exists(cwd);
let output = Command::new("yt-dlp")
.current_dir(cwd)
.arg("--downloader")
.arg("aria2c")
.arg("--write-thumbnail")
.arg("-o")
.arg(opt.output_format.as_deref().unwrap_or("%(title)s.%(ext)s"))
.arg("--embed-thumbnail")
.arg("--embed-chapters")
.arg("--embed-info-json")
.arg("--convert-thumbnails")
.arg(opt.thumbnail_format.as_deref().unwrap_or("jpg"))
.arg(video_url)
.output()
.map_err(|_| "yt-dlp command failed".to_string())?;
if !output.status.success() {
let error_message = String::from_utf8_lossy(&output.stderr).to_string();
return Err(error_message);
}
Ok(())
}
}
pub struct DownloadOptions {
thumbnail_format: Option<String>,
output_format: Option<String>,
}

233
src/yt_dlp/mod.rs Normal file
View file

@ -0,0 +1,233 @@
use std::{
collections::HashMap,
io::{BufRead, BufReader},
path::PathBuf,
process::Command,
};
use serde::{Deserialize, Serialize};
use crate::{ensure_dir_exists, Module};
/// Configuration for one generic `yt-dlp` module instance.
///
/// Every `Option` field may be omitted; the fallback values mentioned below
/// are the ones applied by `YtDlpModule` when the field is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct YtDlpConfig {
/// Module name. Falls back to `"yt-dlp"` in `Module::name` when unset.
pub name: Option<String>,
/// Interval in minutes between checks.
pub interval: u64,
/// Amount of items to query per configured entry (passed as
/// `--playlist-end`). Falls back to `10`.
pub limit: Option<u64>,
/// Items to check, as `name -> URL`. The name is also used as the
/// sub-directory (below the module's root directory) that downloads go to.
pub items: HashMap<String, String>,
/// Format of the thumbnail (passed to `--convert-thumbnails`).
/// Falls back to `"jpg"`.
pub thumbnail_format: Option<String>,
/// Output template for yt-dlp (passed to `-o`).
/// Falls back to `"%(title)s.%(ext)s"`.
pub output_format: Option<String>,
/// Download description (`--write-description`). Falls back to `false`.
pub write_description: Option<bool>,
/// Download info.json (`--write-info-json`). Falls back to `false`.
pub write_info_json: Option<bool>,
/// Download comments (`--write-comments`). Falls back to `false`.
pub write_comments: Option<bool>,
/// Download thumbnail (`--write-thumbnail`). Falls back to `true`.
pub write_thumbnail: Option<bool>,
/// Download subtitles (`--write-subs`). Falls back to `false`.
pub write_subs: Option<bool>,
/// Extract audio (`--extract-audio`). Falls back to `false`.
pub audio_only: Option<bool>,
/// Audio format (`--audio-format`); the flag is only passed when set.
pub audio_format: Option<String>,
/// Embed subtitles (`--embed-subs`). Falls back to `false`.
pub embed_subs: Option<bool>,
/// Embed thumbnail (`--embed-thumbnail`). Falls back to `true`.
pub embed_thumbnail: Option<bool>,
/// Embed metadata (`--embed-metadata`). Falls back to `true`.
pub embed_metadata: Option<bool>,
/// Embed chapters (`--embed-chapters`). Falls back to `true`.
pub embed_chapters: Option<bool>,
/// Embed info.json (`--embed-info-json`). Falls back to `true`.
pub embed_info_json: Option<bool>,
/// Split by chapter (`--split-chapters`). Falls back to `false`.
pub split_chapters: Option<bool>,
/// Format selection (`--format`); the flag is only passed when set.
pub format: Option<String>,
/// Cookie file (`--cookies`); the flag is only passed when set.
pub cookie: Option<String>,
}
/// A module that periodically checks the configured items with `yt-dlp` and
/// downloads entries whose URL is not yet recorded in the database.
#[derive(Clone)]
pub struct YtDlpModule {
// Settings controlling what is checked and how it is downloaded.
config: YtDlpConfig,
// Handle used to look up and record downloaded URLs.
db: crate::db::Database,
// Base directory; each item downloads into `root_dir/<item name>`.
root_dir: PathBuf,
}
impl YtDlpModule {
pub const fn new(config: YtDlpConfig, db: crate::db::Database, root_dir: PathBuf) -> Self {
Self {
config,
db,
root_dir,
}
}
}
impl Module for YtDlpModule {
fn name(&self) -> String {
self.config
.name
.clone()
.unwrap_or_else(|| "yt-dlp".to_string())
}
fn run(&self) {
loop {
log::info!("Running {} Module", self.name());
log::info!("Checking {} items", self.config.items.len());
for (item, item_url) in &self.config.items {
log::info!("Fetching \"{item}\" videos");
match Self::get_latest_entries(item_url, self.config.limit.unwrap_or(10)) {
Ok(latest_videos) => {
for (video_title, video_url) in latest_videos {
if self.db.check_for_url(&video_url) {
log::trace!(
"Skipping \"{video_title}\" because it was already downloaded"
);
} else {
match self.download(&video_url, &self.root_dir.join(item)) {
Ok(()) => {
// mark as downloaded
self.db.insert_url(&video_url);
log::info!("Downloaded \"{video_title}\"");
}
Err(e) => {
log::error!(
"Error downloading \"{video_title}\"; Reason: {e}"
);
}
}
}
}
}
Err(e) => {
log::error!("Could not get videos from \"{item}\". Reason: {e}");
}
}
}
log::info!(
"{} complete. Sleeping for {} minutes...",
self.name(),
self.config.interval
);
std::thread::sleep(std::time::Duration::from_secs(self.config.interval * 60));
}
}
}
impl YtDlpModule {
    /// Lists the latest entries of `channel` (any URL yt-dlp can enumerate).
    ///
    /// Runs `yt-dlp --flat-playlist --skip-download --print title,webpage_url`
    /// limited to `limit` entries and reads its output as alternating
    /// title / URL lines.
    ///
    /// # Returns
    /// At most `limit` `(title, url)` pairs, in the order yt-dlp printed them.
    /// A pair containing a line that is not valid UTF-8 is skipped.
    ///
    /// # Errors
    /// Returns a message if yt-dlp could not be started, or yt-dlp's stderr
    /// (lossily decoded) if it exited unsuccessfully. This function does not
    /// panic on either condition, so a missing binary or odd output cannot
    /// take down the calling module thread.
    fn get_latest_entries(channel: &str, limit: u64) -> Result<Vec<(String, String)>, String> {
        let output = Command::new("yt-dlp")
            .arg("--no-warnings")
            .arg("--flat-playlist")
            .arg("--skip-download")
            .arg("--print")
            .arg("title,webpage_url")
            .arg("--playlist-end")
            .arg(limit.to_string())
            .arg(channel)
            .output()
            .map_err(|e| format!("Failed to execute yt-dlp: {e}"))?;

        if !output.status.success() {
            return Err(String::from_utf8_lossy(&output.stderr).into_owned());
        }

        // Two lines are always consumed together so that a bad line cannot
        // shift titles and URLs against each other.
        let mut lines = BufReader::new(&output.stdout[..]).lines();
        let mut videos = Vec::new();
        while let (Some(title), Some(url)) = (lines.next(), lines.next()) {
            if let (Ok(title), Ok(url)) = (title, url) {
                videos.push((title, url));
            }
        }

        // Saturate instead of truncating when `u64` does not fit into `usize`.
        videos.truncate(usize::try_from(limit).unwrap_or(usize::MAX));
        Ok(videos)
    }

    /// Downloads `video_url` into `cwd` with yt-dlp, using aria2c as the
    /// downloader and the flags selected by the module configuration.
    ///
    /// `cwd` is created if necessary (via `ensure_dir_exists`) and used as
    /// the working directory of the yt-dlp process.
    ///
    /// # Errors
    /// Returns a message if yt-dlp could not be started, or yt-dlp's stderr
    /// (lossily decoded) if it exited unsuccessfully.
    // `&PathBuf` is kept because `cwd` is handed on to `ensure_dir_exists`,
    // whose parameter type is defined outside this block.
    fn download(&self, video_url: &str, cwd: &PathBuf) -> Result<(), String> {
        ensure_dir_exists(cwd);
        let cfg = &self.config;

        // `Command`'s builder methods mutate in place, so no re-binding is needed.
        let mut command = Command::new("yt-dlp");
        command.current_dir(cwd).arg("--downloader").arg("aria2c");

        // (enabled, flag) pairs, passed in this order when enabled.
        let write_flags = [
            (cfg.write_thumbnail.unwrap_or(true), "--write-thumbnail"),
            (cfg.write_description.unwrap_or(false), "--write-description"),
            (cfg.write_info_json.unwrap_or(false), "--write-info-json"),
            (cfg.write_comments.unwrap_or(false), "--write-comments"),
            (cfg.write_subs.unwrap_or(false), "--write-subs"),
            (cfg.audio_only.unwrap_or(false), "--extract-audio"),
        ];
        command.args(write_flags.iter().filter(|(on, _)| *on).map(|(_, flag)| *flag));

        if let Some(audio_format) = &cfg.audio_format {
            command.arg("--audio-format").arg(audio_format);
        }

        let embed_flags = [
            (cfg.embed_chapters.unwrap_or(true), "--embed-chapters"),
            (cfg.embed_info_json.unwrap_or(true), "--embed-info-json"),
            (cfg.embed_metadata.unwrap_or(true), "--embed-metadata"),
            (cfg.embed_subs.unwrap_or(false), "--embed-subs"),
            (cfg.embed_thumbnail.unwrap_or(true), "--embed-thumbnail"),
            (cfg.split_chapters.unwrap_or(false), "--split-chapters"),
        ];
        command.args(embed_flags.iter().filter(|(on, _)| *on).map(|(_, flag)| *flag));

        if let Some(format) = &cfg.format {
            command.arg("--format").arg(format);
        }
        if let Some(cookie) = &cfg.cookie {
            command.arg("--cookies").arg(cookie);
        }

        let output = command
            .arg("--convert-thumbnails")
            .arg(cfg.thumbnail_format.as_deref().unwrap_or("jpg"))
            .arg("-o")
            .arg(cfg.output_format.as_deref().unwrap_or("%(title)s.%(ext)s"))
            .arg(video_url)
            .output()
            .map_err(|e| format!("yt-dlp command failed: {e}"))?;

        if !output.status.success() {
            return Err(String::from_utf8_lossy(&output.stderr).into_owned());
        }
        Ok(())
    }
}