JMARyA 2023-08-26 13:45:43 +02:00
commit 72a8357548
Signed by: jmarya
GPG key ID: 901B2ADDF27C2263
18 changed files with 5327 additions and 0 deletions

2
.gitignore vendored Normal file

@@ -0,0 +1,2 @@
/target
.vscode

1835
Cargo.lock generated Normal file

File diff suppressed because it is too large

23
Cargo.toml Normal file

@@ -0,0 +1,23 @@
[package]
name = "scrape"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
thirtyfour = "0.31.0"
tokio = { version = "1.32.0", features = ["full"] }
clap = "2.33"
strum = { version = "0.21", features = ["derive"] }
regex = "1.9.3"
chrono = "0.4.26"
url = "2.4.0"
serde = "1.0.183"
serde_json = "1.0.105"
async-trait = "0.1.73"
log = "0.4"
env_logger = "0.10"
base64 = "0.21.2"
reqwest = { version = "0.11.18" }
urlencoding = "2.1.3"

3
README.md Normal file

@@ -0,0 +1,3 @@
# scrape
Scrape is a tool that scrapes websites and turns their data into JSON, similar to yt-dlp but for web scraping. The scraped data is printed to stdout as a single JSON object.

59
docs/add-new-extractor.md Normal file

@@ -0,0 +1,59 @@
# Adding a New Extractor
## Create a New Extractor Source File
Create a new Rust source file inside `src/extractors`.
`src/extractors/myext.rs`:
```rust
use super::prelude::*;

pub struct MySiteExtractor;

impl MySiteExtractor {
    pub fn new() -> Self { Self {} }
}
```
## Implement the Extractor Trait
Implement the `Extractor` trait by providing the required methods: `supported_hosts`, `name`, and `run_scrape`. The trait is async, so the `impl` block must be annotated with `#[async_trait]`:
```rust
#[async_trait]
impl Extractor for MySiteExtractor {
    fn supported_hosts(&self) -> Vec<&str> {
        vec!["my-site.com"]
    }

    fn name(&self) -> String {
        "My Site Extractor".to_string()
    }

    async fn run_scrape(
        &self,
        url: Url,
        browser: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        // scraping logic
        unimplemented!()
    }
}
```
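As a rough guide, a typical `run_scrape` body first navigates to the page and then fills the result map using the `Browser` helpers the existing extractors rely on (`goto`, `get_element_text_by_xpath`, ...). The sketch below shows one way to replace the `unimplemented!()` placeholder; the XPath selector and the `title` key are illustrative only, not part of any real site:
```rust
async fn run_scrape(
    &self,
    url: Url,
    browser: &mut crate::Browser,
    _conf: &Config, // unused in this sketch
) -> Result<HashMap<String, Value>, String> {
    // Navigate to the page to scrape.
    browser.goto(url.to_string()).await.unwrap();
    let mut info: HashMap<String, Value> = HashMap::new();
    // Extract a single field via XPath; the selector is a placeholder.
    let title = browser
        .get_element_text_by_xpath(r#"//h1[@class="title"]"#)
        .await
        .ok_or("could not get title")?;
    info.insert("title".into(), title.into());
    Ok(info)
}
```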
## Register the Extractor
In `src/extractors/mod.rs`, declare the new module and add your extractor to the list returned by `get_extractors`:
```rust
pub mod myext;

[...]

#[must_use]
pub fn get_extractors() -> Vec<Box<dyn Extractor>> {
    vec![
        ...
        // Add your new extractor
        Box::new(myext::MySiteExtractor::new()),
        ...
    ]
}
```
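Once registered, no further wiring is needed: `scrape_url` in `src/extractors/mod.rs` iterates over `get_extractors()` and picks the first extractor whose default `match_url` finds the URL's host in `supported_hosts`. A simplified sketch of that dispatch, assuming it sits inside the `extractors` module so `Extractor` and `get_extractors` are in scope (the helper name `pick_extractor` is hypothetical):
```rust
use url::Url;

// Hypothetical helper mirroring the selection loop in scrape_url():
// the first extractor whose supported_hosts() contains the URL's host wins.
fn pick_extractor(url: &Url) -> Option<Box<dyn Extractor>> {
    get_extractors().into_iter().find(|e| e.match_url(url))
}
```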

123
src/extractors/amazon.rs Normal file

@@ -0,0 +1,123 @@
use crate::{
util::{currency, escape_key},
Language,
};
use super::prelude::*;
pub struct AmazonExtractor {}
impl AmazonExtractor {
pub fn new() -> Self {
Self {}
}
pub async fn amazon_product(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
let mut url = url;
match conf.language {
Language::en_US => {
url.query_pairs_mut().append_pair("language", "en_GB");
b.goto(url.to_string()).await.unwrap();
}
_ => {
url.query_pairs_mut()
.append_pair("language", &conf.language.to_string());
b.goto(url.to_string()).await.unwrap();
}
}
log::info!(
"Changing to '{url}' because of {:?} language",
conf.language
);
let mut info: HashMap<String, Value> = HashMap::new();
b.click_on_xpath(r#"//*[@id="sp-cc-accept"]"#).await;
info.insert(
"product_title".into(),
b.get_element_text_by_xpath(r#"//*[@id="productTitle"]"#)
.await
.unwrap()
.into(),
);
info.insert(
"star_rating".into(),
b.get_element_text_by_xpath(
r#"//*[@class="reviewCountTextLinkedHistogram noUnderline"]/span[1]/a/span"#,
)
.await
.unwrap()
.replace(',', ".")
.parse::<f64>()
.unwrap()
.into(),
);
let price_symbol = b
.get_element_text_by_xpath(
r#"//*[@id="corePrice_feature_div"]//span[@class="a-price-symbol"]"#,
)
.await
.unwrap();
let whole = b
.get_element_text_by_xpath(
r#"//*[@id="corePrice_feature_div"]//span[@class="a-price-whole"]"#,
)
.await
.unwrap()
.replace(['.', ','], "");
let fraction = b
.get_element_text_by_xpath(
r#"//*[@id="corePrice_feature_div"]//span[@class="a-price-fraction"]"#,
)
.await
.unwrap();
info.insert(
"price".into(),
currency(&format!("{whole}.{fraction}{price_symbol}")).into(),
);
let mut tech_details: Map<String, Value> = Map::new();
let tech_details_html = b
.get_element_by_xpath(r#"//*[@id="productDetails_techSpec_section_1"]/tbody"#)
.await
.unwrap();
for detail in tech_details_html.find_all(By::Tag("tr")).await.unwrap() {
let key = text_from!(detail.find(By::Tag("th")).await.unwrap());
let value = text_from!(detail.find(By::Tag("td")).await.unwrap());
tech_details.insert(escape_key(&key), value.into());
}
info.insert("technical_details".into(), tech_details.into());
Ok(info)
}
}
#[async_trait]
impl Extractor for AmazonExtractor {
fn name(&self) -> String {
"AMAZON".to_owned()
}
fn supported_hosts(&self) -> Vec<&str> {
vec!["www.amazon.de"]
}
async fn run_scrape(
&self,
url: Url,
browser: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
self.amazon_product(url, browser, conf).await
}
}

343
src/extractors/anilist.rs Normal file

@@ -0,0 +1,343 @@
use regex::Regex;
use crate::util::{escape_key, extract_texts_from_elements, handle_media_url};
use super::prelude::*;
pub struct AnilistExtractor {}
impl AnilistExtractor {
pub fn new() -> Self {
Self {}
}
async fn anime(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
b.goto(url.to_string()).await.unwrap();
let mut info: HashMap<String, Value> = HashMap::new();
if b.wait_for(r#"//span[text()="AGREE"]"#, Duration::from_secs(5))
.await
{
b.click_on_xpath(r#"//span[text()="AGREE"]"#).await;
}
if b.wait_for(
r#"//div[@class="header"]//div[@class="content"]/h1"#,
Duration::from_secs(5),
)
.await
{
info.insert(
"title".into(),
b.get_element_text_by_xpath(r#"//div[@class="header"]//div[@class="content"]/h1"#)
.await
.unwrap()
.into(),
);
info.insert(
"cover".into(),
handle_media_url(
&b.get_element_attr_by_xpath(
r#"//div[@class="header"]//div[@class="cover-wrap-inner"]/img"#,
"src",
)
.await
.unwrap(),
"cover",
false,
conf,
)
.await
.into(),
);
if let Some(desc_show_more) = b
.get_element_by_xpath(r#"//span[@class="description-length-toggle"]"#)
.await
{
b.scroll_to_element(&desc_show_more).await.unwrap();
desc_show_more.click().await.unwrap();
}
info.insert(
"description".into(),
b.get_element_text_by_xpath(r#"//div[@class="header"]//p[@class="description"]"#)
.await
.unwrap()
.into(),
);
let data_html = b
.get_element_by_xpath(r#"//div[@class="data"]"#)
.await
.unwrap();
for data_field in data_html.find_all(By::XPath("./div")).await.unwrap() {
let key = escape_key(&text_from!(data_field
.find(By::XPath(r#"./div[@class="type"]"#))
.await
.unwrap()));
let val = data_field
.find(By::XPath(r#"./*[@class="value"]"#))
.await
.unwrap();
let val_list = val.find_all(By::XPath("./span")).await.unwrap();
let val_list = extract_texts_from_elements(val_list).await;
// TODO : parse data fields
if val_list.is_empty() {
info.insert(key, text_from!(val).into());
} else {
info.insert(key, val_list.into());
}
}
let mut tags: Vec<String> = vec![];
let tags_html = b
.get_element_by_xpath(r#"//div[@class="tags"]"#)
.await
.unwrap();
for tag_html in tags_html
.find_all(By::XPath(r#"./div[@class="tag"]"#))
.await
.unwrap()
{
tags.push(text_from!(tag_html
.find(By::XPath(r#"./a[1]"#))
.await
.unwrap()));
}
info.insert("tags".into(), tags.into());
let mut websites: Map<String, Value> = Map::new();
let websites_html = b
.get_element_by_xpath(r#"//div[@class="external-links"]/div"#)
.await
.unwrap();
for website in websites_html.find_all(By::XPath("./a")).await.unwrap() {
let web_link = attr_from!(website, "href");
let mut web_name = text_from!(website
.find(By::XPath(r#"./span[@class="name"]"#))
.await
.unwrap());
let language = match website
.find(By::XPath(r#".//span[@class="language"]"#))
.await
{
Ok(el) => Some(text_from!(el)),
Err(_) => None,
};
if language.is_some() {
web_name = web_name.replace(&format!(" {}", language.clone().unwrap()), "");
}
websites.insert(
web_name,
json!({
"url": web_link,
"language": language
}),
);
}
info.insert("websites".into(), websites.into());
let mut relations: Vec<Value> = vec![];
let relations_html = b
.get_element_by_xpath(r#"//div[@class="relations"]"#)
.await
.unwrap();
for relation_html in relations_html
.find_all(By::XPath(r#"./div/div"#))
.await
.unwrap()
{
let relation_info = text_from!(relation_html
.find(By::XPath(r#"./div[@class="content"]/div[@class="info"]"#))
.await
.unwrap());
let (rel_type, rel_status) = relation_info.split_once(" · ").unwrap();
let relation = json!({
"url": b.get_url_from_link(relation_html.find(By::XPath("./a")).await.unwrap()).await,
"kind": text_from!(relation_html.find(By::XPath(r#"./div[@class="content"]/div[@class="info-header"]/div"#)).await.unwrap()),
"title": text_from!(relation_html.find(By::XPath(r#"./div[@class="content"]/a[@class="title"]"#)).await.unwrap()),
"type": rel_type,
"status": rel_status
});
relations.push(relation);
}
info.insert("relations".into(), relations.into());
let recommendations_html = b
.get_element_by_xpath(r#"//div[@class="recommendations"]"#)
.await
.unwrap();
let show_all_button = recommendations_html
.find(By::XPath(r#".//div[@class="view-all"]/div"#))
.await
.unwrap();
b.scroll_to_element(&show_all_button).await.unwrap();
show_all_button.click().await.unwrap();
let mut recommendations: Map<String, Value> = Map::new();
for rec_html in recommendations_html
.find_all(By::XPath(r#"./div/div[@class="recommendation-card"]/a"#))
.await
.unwrap()
{
let rec_url = b.get_url_from_link(rec_html.clone()).await;
let rec_title = text_from!(rec_html);
recommendations.insert(rec_title, rec_url.into());
}
info.insert("recommendations".into(), recommendations.into());
let mut nav_urls: Vec<String> = vec![];
for nav in b
.get_elements_by_xpath(r#"//div[@class="nav"]/a"#)
.await
.unwrap()
{
nav_urls.push(attr_from!(nav, "href"));
}
for nav_url in nav_urls {
if nav_url.ends_with("characters") {
info.insert(
"characters".into(),
self.characters_page(&nav_url, b, conf).await?.into(),
);
}
if nav_url.ends_with("staff") {
info.insert(
"staff".into(),
self.staff_page(&nav_url, b, conf).await?.into(),
);
}
if nav_url.ends_with("stats") {
info.insert(
"stats".into(),
self.stats_page(&nav_url, b, conf).await?.into(),
);
}
// todo : reviews?
}
} else {
Err("Scrape failed")?;
}
Ok(info)
}
async fn characters_page(
&self,
_url: &str,
_b: &mut crate::Browser,
_conf: &Config,
) -> Result<Map<String, Value>, String> {
// todo : character page
Ok(Map::new())
}
async fn staff_page(
&self,
_url: &str,
_b: &mut crate::Browser,
_conf: &Config,
) -> Result<Map<String, Value>, String> {
// todo : staff page
Ok(Map::new())
}
async fn stats_page(
&self,
url: &str,
b: &mut crate::Browser,
_conf: &Config,
) -> Result<Map<String, Value>, String> {
let mut data = Map::new();
b.goto(url).await.unwrap();
if b.wait_for(r#"//div[@class="rankings graph"]"#, Duration::from_secs(5))
.await
{
let mut ranking = Map::new();
for rank in b.get_elements_by_xpath(r#"//div[@class="rankings graph"]/a[@class="ranking popular"]/span[@class="rank-text"]"#).await.unwrap() {
let re = Regex::new(r"#(\d+)").unwrap();
if let Some(matched) = re.find(&rank.text().await.unwrap()) {
if let Ok(rank_num) = matched.as_str()[1..].parse::<i32>() {
let rank_key = text_from!(rank).replacen(matched.as_str(), "", 1).trim().to_string();
ranking.insert(escape_key(&rank_key), rank_num.into());
}
}
}
for rank in b.get_elements_by_xpath(r#"//div[@class="rankings graph"]/a[@class="ranking rated"]/span[@class="rank-text"]"#).await.unwrap() {
let re = Regex::new(r"#(\d+)").unwrap();
if let Some(matched) = re.find(&rank.text().await.unwrap()) {
if let Ok(rank_num) = matched.as_str()[1..].parse::<i32>() {
let rank_key = text_from!(rank).replacen(matched.as_str(), "", 1).trim().to_string();
ranking.insert(escape_key(&rank_key), rank_num.into());
}
}
}
data.insert("ranking".into(), ranking.into());
let mut viewer_status: Map<String, Value> = Map::new();
if b.wait_for(r#"//div[@class="status-distribution content-wrap"]/div[@class="statuses"]/div[@class="status"]"#, Duration::from_secs(5)).await {
b.scroll_to_element(&b.get_element_by_xpath(r#"//div[@class="status-distribution content-wrap"]/div[@class="statuses"]/div[@class="status"]"#).await.unwrap()).await.unwrap();
}
let viewer_status_dist = b.get_elements_by_xpath(r#"//div[@class="status-distribution content-wrap"]/div[@class="statuses"]/div[@class="status"]"#).await.unwrap();
for status in viewer_status_dist {
b.scroll_to_element(&status).await.unwrap();
let key = text_from!(status
.find(By::XPath(r#"./div[@class="name"]"#))
.await
.unwrap());
if !key.is_empty() {
let val: isize = text_from!(status
.find(By::XPath(r#"./div[@class="amount"]"#))
.await
.unwrap())
.replace(" Users", "")
.parse()
.unwrap();
viewer_status.insert(escape_key(&key), val.into());
}
}
data.insert("viewer_status".into(), viewer_status.into());
// TODO : not complete
} else {
Err("Scrape failed")?;
}
Ok(data)
}
}
#[async_trait]
impl Extractor for AnilistExtractor {
fn name(&self) -> String {
"ANILIST".to_owned()
}
fn supported_hosts(&self) -> Vec<&str> {
vec!["anilist.co"]
}
async fn run_scrape(
&self,
url: Url,
browser: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
self.anime(url, browser, conf).await
}
}

289
src/extractors/anisearch.rs Normal file

@@ -0,0 +1,289 @@
use thirtyfour::WebElement;
use crate::util::{escape_key, handle_media_url, remove_last_n_chars};
use super::prelude::*;
async fn split_header(el: &WebElement) -> (String, String) {
let header = text_from!(el
.find(By::XPath(r#".//span[@class="header"]"#))
.await
.unwrap());
let value = text_from!(el);
let value = value.replacen(&header, "", 1).trim().to_owned();
(header.replace(':', ""), value)
}
pub struct AnisearchExtractor {}
impl AnisearchExtractor {
pub fn new() -> Self {
Self {}
}
async fn anime_search(
&self,
url: Url,
b: &mut crate::Browser,
_conf: &Config,
) -> Result<HashMap<String, Value>, String> {
b.goto(url.to_string()).await.unwrap();
let mut info: HashMap<String, Value> = HashMap::new();
info.insert(
"query".into(),
remove_last_n_chars(
&b.get_element_text_by_xpath(r#"//*[@id="item-key-a-text"]"#)
.await
.unwrap()
.replace("Title starts with \"", ""),
1,
)
.into(),
);
let mut results: Vec<String> = vec![];
for result in b
.get_elements_by_xpath(r#"//ul[@class="covers"]/li"#)
.await
.unwrap()
{
let link = b
.get_url_from_link(result.find(By::XPath(".//a")).await.unwrap())
.await;
results.push(link);
}
info.insert("results".into(), results.into());
Ok(info)
}
async fn anime(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
let mut info: HashMap<String, Value> = HashMap::new();
b.goto(url.to_string()).await.unwrap();
if b.wait_for(
r#"//div[@class="needsclick cmp-root-container"]"#,
Duration::from_secs(3),
)
.await
{
let script = r#"
return document.querySelector("\#top > div.needsclick.cmp-root-container").shadowRoot.querySelector("\#consentDialog > div.cmp_ui.cmp_ext_text.cmp_state-stacks > div.cmp_navi > div > div.cmp_mainButtons > div > div.cmp_primaryButtonLine > div > div")
"#;
let accept_cookies = b.execute(script, ([]).to_vec()).await.unwrap();
accept_cookies.element().unwrap().click().await.unwrap();
}
let anime_info_section = b
.get_element_by_xpath(r#"//section[@id="information"]"#)
.await
.unwrap();
let title_element = anime_info_section
.find(By::XPath(r#".//div[@class="title"]//strong[@class="f16"]"#))
.await
.unwrap();
info.insert(
"original_title".into(),
text_from!(anime_info_section
.find(By::XPath(r#".//div[@class="title"]//div"#))
.await
.unwrap())
.into(),
);
info.insert("title".into(), text_from!(title_element).into());
let cover_image_url = attr_from!(
anime_info_section
.find(By::XPath(r#".//figure[@id="cover-container"]/img"#))
.await
.unwrap(),
"src"
);
info.insert(
"cover".into(),
handle_media_url(&cover_image_url, "cover", false, conf)
.await
.into(),
);
let mut details: Map<String, Value> = Map::new();
let details_elements = anime_info_section
.find_all(By::XPath(r#"./div/ul/li[2]/ul/li[1]/div"#))
.await
.unwrap();
for element in details_elements {
if attr_from!(element, "class") == "title" {
continue;
}
let (key, val) = split_header(&element).await;
if attr_from!(element, "class") == "creators" {
details.insert(
escape_key(&key),
val.split(", ").collect::<Vec<&str>>().into(),
);
continue;
}
if attr_from!(element, "class") == "websites" {
let mut links: Vec<String> = vec![];
let links_html = element.find_all(By::XPath("./a")).await.unwrap();
for l in links_html {
links.push(attr_from!(l, "href"));
}
details.insert(escape_key(&key), links.into());
continue;
}
details.insert(escape_key(&key), val.into());
}
info.insert("details".into(), details.into());
for desc in b
.get_elements_by_xpath(r#"//section[@id="description"]//button"#)
.await
.unwrap()
{
let desc_lang = attr_from!(desc, "lang");
if try_attr_from!(desc, "class").unwrap_or(String::new()) != "active" {
let show_more_button = b
.get_element_by_xpath(&format!(
r#"//section[@id="description"]//button[@lang="{desc_lang}"]"#
))
.await
.unwrap();
b.scroll_to_element(&show_more_button).await.unwrap();
show_more_button.click().await.unwrap();
}
}
let mut descriptions: Map<String, Value> = Map::new();
for desc in b
.get_elements_by_xpath(
r#"//section[@id="description"]//div[@class="textblock details-text"]"#,
)
.await
.unwrap()
{
let desc_lang = attr_from!(desc, "lang");
let desc_text = text_from!(desc);
descriptions.insert(desc_lang, desc_text.into());
}
info.insert("description".into(), descriptions.into());
let tag_cloud = b
.get_element_by_xpath(r#"//*[@id="description"]//ul[@class="cloud"]"#)
.await
.unwrap();
let mut genres = json!({"main": [], "sub": []}).as_object().unwrap().clone();
let mut tags: Vec<String> = vec![];
for tag in tag_cloud.find_all(By::XPath("./li/a")).await.unwrap() {
if attr_from!(tag, "class") == "gg showpop" {
genres
.get_mut("main")
.unwrap()
.as_array_mut()
.unwrap()
.push(text_from!(tag).into());
}
if attr_from!(tag, "class") == "gc showpop" && !text_from!(tag).is_empty() {
genres
.get_mut("sub")
.unwrap()
.as_array_mut()
.unwrap()
.push(text_from!(tag).into());
}
if attr_from!(tag, "class") == "gt showpop" {
tags.push(text_from!(tag));
}
}
info.insert("genres".into(), genres.into());
info.insert("tags".into(), tags.into());
if let Some(show_more_button) = b
.get_element_by_xpath(r#"//*[@id="information"]/div/ul/li[2]/div/button"#)
.await
{
b.scroll_to_element(&show_more_button).await.unwrap();
show_more_button.click().await.unwrap();
}
let lang_html = b
.get_elements_by_xpath(r#"//*[@id="information"]/div/ul/li[2]/ul/li"#)
.await
.unwrap();
let mut dubs: Map<String, Value> = Map::new();
if let Some(ol) = b.get_element_by_xpath(r#"//div[@class="title"]"#).await {
dubs.insert(attr_from!(ol, "lang"), Value::Object(Map::new()));
}
let mut subs: Map<String, Value> = Map::new();
for dub in lang_html {
let lang_info = dub.find_all(By::XPath("./div")).await.unwrap();
if lang_info.len() != 4 {
continue;
}
let lang_lang = attr_from!(lang_info[0], "lang");
let mut is_dub = false;
if lang_info[0]
.find(By::XPath(r#".//span[@class="speaker"]"#))
.await
.is_ok()
{
is_dub = true;
}
let lang_status = split_header(&lang_info[1]).await;
let lang_release = split_header(&lang_info[2]).await;
let lang_publisher = split_header(&lang_info[3]).await;
let lang_map = json!({
escape_key(&lang_status.0): lang_status.1,
escape_key(&lang_release.0): lang_release.1,
escape_key(&lang_publisher.0): lang_publisher.1
});
if is_dub {
dubs.insert(lang_lang, lang_map);
} else {
subs.insert(lang_lang, lang_map);
}
}
info.insert("dubs".into(), dubs.into());
info.insert("subs".into(), subs.into());
Ok(info)
}
}
#[async_trait]
impl Extractor for AnisearchExtractor {
fn name(&self) -> String {
"ANISEARCH".to_owned()
}
fn supported_hosts(&self) -> Vec<&str> {
vec!["www.anisearch.com"]
}
async fn run_scrape(
&self,
url: Url,
browser: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
if url.path().starts_with("/anime/index") {
self.anime_search(url, browser, conf).await
} else {
self.anime(url, browser, conf).await
}
}
}

228
src/extractors/aur.rs Normal file

@@ -0,0 +1,228 @@
use super::prelude::*;
pub struct AURExtractor {}
impl AURExtractor {
pub fn new() -> Self {
Self {}
}
pub async fn aur_package(
&self,
url: Url,
b: &mut crate::Browser,
_conf: &Config,
) -> Result<HashMap<String, Value>, String> {
b.goto(url.to_string()).await.unwrap();
let head = text_from!(b
.get_element_by_xpath(r#"//*[@id="pkgdetails"]/h2"#)
.await
.unwrap())[17..]
.to_owned();
let (name, version) = head.split_once(' ').unwrap();
let pkg_info = b
.get_element_by_xpath(r#"//*[@id="pkginfo"]"#)
.await
.unwrap();
let mut info: HashMap<String, Value> = {
let mut hm = HashMap::new();
hm.insert("name".into(), name.into());
hm.insert("version".into(), version.into());
hm
};
for row in pkg_info.find_all(By::Tag("tr")).await.unwrap() {
match text_from!(row.find(By::Tag("th")).await.unwrap()).as_str() {
"Git Clone URL:" => {
info.insert(
"clone".into(),
attr_from!(row.find(By::Tag("a")).await.unwrap(), "href").into(),
);
}
"Description:" => {
info.insert(
"description".into(),
text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
);
}
"Upstream URL:" => {
info.insert(
"upstream".into(),
attr_from!(row.find(By::Tag("a")).await.unwrap(), "href").into(),
);
}
"Keywords:" => {
let keyword_items = row.find_all(By::Tag("a")).await.unwrap();
let mut keywords: Vec<String> = vec![];
for kw in keyword_items {
keywords.push(text_from!(kw));
}
info.insert("keywords".into(), keywords.into());
}
"Licenses:" => {
info.insert(
"license".into(),
text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
);
}
"Submitter:" => {
info.insert(
"submitter".into(),
text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
);
}
"Maintainer:" => {
info.insert(
"maintainer".into(),
text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
);
}
"Last Packager:" => {
info.insert(
"last_packager".into(),
text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
);
}
"Votes:" => {
info.insert(
"votes".into(),
text_from!(row.find(By::Tag("td")).await.unwrap())
.parse::<usize>()
.unwrap()
.into(),
);
}
"Popularity:" => {
info.insert(
"popularity".into(),
text_from!(row.find(By::Tag("td")).await.unwrap())
.parse::<f64>()
.unwrap()
.into(),
);
}
"First Submitted:" => {
info.insert(
"first_submitted".into(),
text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
);
}
"Last Updated:" => {
info.insert(
"last_updated".into(),
text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
);
}
_ => {
log::debug!("unknown column");
}
}
}
let mut dependencies: Vec<Value> = vec![];
let dependency_items = b
.get_element_by_xpath(r#"//*[@id="pkgdepslist"]"#)
.await
.unwrap();
let mut deps = dependency_items.find_all(By::Tag("li")).await.unwrap();
if !deps.is_empty() {
if text_from!(deps.last().unwrap()).contains("Show ") {
b.goto(attr_from!(
deps.last().unwrap().find(By::XPath("./a")).await.unwrap(),
"href"
))
.await
.unwrap();
let dependency_items = b
.get_element_by_xpath(r#"//*[@id="pkgdepslist"]"#)
.await
.unwrap();
deps = dependency_items.find_all(By::Tag("li")).await.unwrap();
}
for dep in deps {
let dep_name = text_from!(dep.find(By::Tag("a")).await.unwrap());
let dep_info =
text_from!(dep.find_all(By::Tag("em")).await.unwrap().last().unwrap());
dependencies.push(json!({
"name": dep_name,
"info": dep_info
}));
}
}
info.insert("dependencies".into(), dependencies.into());
let mut required_by: Vec<Value> = vec![];
let required_by_items = b
.get_element_by_xpath(r#"//*[@id="pkgreqslist"]"#)
.await
.unwrap();
let mut reqs = required_by_items.find_all(By::Tag("li")).await.unwrap();
if !reqs.is_empty() {
if text_from!(reqs.last().unwrap()).contains("Show ") {
b.goto(attr_from!(
reqs.last().unwrap().find(By::XPath("./a")).await.unwrap(),
"href"
))
.await
.unwrap();
let required_by_items = b
.get_element_by_xpath(r#"//*[@id="pkgreqslist"]"#)
.await
.unwrap();
reqs = required_by_items.find_all(By::Tag("li")).await.unwrap();
}
for req in reqs {
let req_name = text_from!(req.find(By::Tag("a")).await.unwrap());
let req_info =
text_from!(req.find_all(By::Tag("em")).await.unwrap().last().unwrap());
required_by.push(json!({
"name": req_name,
"optional": (req_info == "(optional)")
}));
}
}
info.insert("required_by".into(), required_by.into());
let mut sources: Vec<String> = vec![];
for source in b
.get_element_by_xpath(r#"//*[@id="pkgsrcslist"]"#)
.await
.unwrap()
.find_all(By::Tag("li"))
.await
.unwrap()
{
sources.push(attr_from!(source.find(By::Tag("a")).await.unwrap(), "href"));
}
info.insert("sources".into(), sources.into());
Ok(info)
}
}
#[async_trait]
impl Extractor for AURExtractor {
fn name(&self) -> String {
"AUR".to_owned()
}
fn supported_hosts(&self) -> Vec<&str> {
vec!["aur.archlinux.org"]
}
async fn run_scrape(
&self,
url: Url,
browser: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
self.aur_package(url, browser, conf).await
}
}

421
src/extractors/igdb.rs Normal file

@@ -0,0 +1,421 @@
use super::prelude::*;
use crate::util::{escape_key, handle_media_url, parse_date, remove_last_n_chars};
pub struct IGDBExtractor {}
impl IGDBExtractor {
pub const fn new() -> Self {
Self {}
}
}
impl IGDBExtractor {
async fn igdb_game(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
b.goto(url.to_string()).await.unwrap();
let mut info: HashMap<String, Value> = HashMap::new();
info.insert(
"name".to_owned(),
remove_last_n_chars(
&b.get_element_text_by_xpath(r#"//*[@class="gamepage-title-wrapper"]/h1"#)
.await
.ok_or("could not get game name")?,
4,
)
.into(),
);
info.insert(
"id".to_owned(),
b.get_element_text_by_xpath(r#"//*[@class="optimisly-game-maininfo"]/div[1]/span"#)
.await
.ok_or("could not get id")?
.into(),
);
let cover_url = b
.get_element_attr_by_xpath(r#"//*[@class="gamepage-cover"]/img[1]"#, "src")
.await
.ok_or("could not get cover url")?;
info.insert(
"cover".to_owned(),
handle_media_url(
&cover_url,
&format!("igdb-{}-cover", info.get("id").unwrap().as_str().unwrap()),
false,
conf,
)
.await
.into(),
);
let genre_and_platform_htmls = b
.get_elements_by_xpath(
r#"//*[@class="gamepage-tabs"]/div[2]/p/span[@class="text-semibold"]/.."#,
)
.await
.unwrap();
let genres_html = genre_and_platform_htmls.first().unwrap();
let mut genres_html = genres_html
.find_all(thirtyfour::By::Tag("a"))
.await
.ok()
.ok_or("could not get genres")?;
let mut genres: Vec<String> = vec![];
for genre in &mut genres_html {
genres.push(text_from!(genre));
}
info.insert("genre".to_owned(), genres.into());
let platforms_html = genre_and_platform_htmls.get(1).unwrap();
let platforms_txt = text_from!(platforms_html)[11..].to_owned();
let mut platforms: Vec<String> = vec![];
for platform in platforms_txt.split(", ") {
platforms.push(platform.to_owned());
}
info.insert("platforms".to_owned(), platforms.into());
info.insert(
"url".to_owned(),
b.get_element_attr_by_xpath(
r#"//*[@class="gamepage-tabs"]/div[4]/div[@class="input-group"]/input"#,
"value",
)
.await
.ok_or("could not get url")?
.into(),
);
let desc = b
.get_element_by_xpath(r#"//*[@class="gamepage-tabs"]/div[2]/div[1]"#)
.await
.ok_or("could not get description")?;
if let Ok(show_more) = desc.find(thirtyfour::By::Tag("span")).await {
show_more.click().await.unwrap();
}
info.insert("description".to_owned(), text_from!(desc).into());
let date_str = b
.get_element_text_by_xpath(r#"//*[@class="banner-subheading"]/span[1]/span[1]"#)
.await
.ok_or("could not get release date")?;
info.insert(
"release".to_owned(),
if date_str == "TBD" {
Option::<String>::None.into()
} else {
Some(parse_date(&date_str, "%b %d, %Y").ok_or("could not parse release date")?)
.into()
},
);
let mut releases: Vec<Map<String, Value>> = vec![];
let releases_html = b
.find(thirtyfour::By::XPath(
r#"//*[@class="optimisly-game-maininfo"]/div[2]"#,
))
.await
.ok()
.ok_or("could not get releases")?;
for release in releases_html
.find_all(thirtyfour::By::XPath("./*"))
.await
.unwrap()
{
let release_platform = text_from!(release
.find(thirtyfour::By::XPath("./div[1]/span"))
.await
.unwrap());
let release_info_html = release
.find(By::XPath("./div[2]/div[1]/div[1]/span"))
.await
.unwrap();
let release_date = text_from!(release_info_html.find(By::Tag("time")).await.unwrap());
let release_info = text_from!(release_info_html.find(By::Tag("strong")).await.unwrap());
releases.push(
json!({
"platform": release_platform,
"date": release_date,
"info": release_info
})
.as_object()
.unwrap()
.clone(),
);
}
info.insert("releases".into(), releases.into());
let mut developers: Vec<String> = vec![];
let developers_html = b
.get_element_by_xpath(
r#"//*[@class="optimisly-game-maininfo"]/div[@itemprop="author"]/span"#,
)
.await
.ok_or("could not get developers")?;
for dev in developers_html.find_all(By::Tag("a")).await.unwrap() {
developers.push(text_from!(dev));
}
info.insert("developers".into(), developers.into());
let mut publishers: Vec<String> = vec![];
if let Some(publishers_html) = b
.get_element_by_xpath(
r#"//*[@class="optimisly-game-maininfo"]/span[@itemprop="publisher"]/span"#,
)
.await
{
for publ in publishers_html.find_all(By::Tag("a")).await.unwrap() {
publishers.push(text_from!(publ));
}
info.insert("publishers".into(), publishers.into());
} else {
log::warn!("could not get publishers");
}
let mut ratings: Map<String, Value> = Map::new();
let ratings_html = b
.find(By::XPath(r#"//*[@class="gamepage-gauge"]"#))
.await
.ok()
.ok_or("could not get ratings")?;
let ratings_html = {
let mut el: Vec<String> = vec![];
for r in &ratings_html.find_all(By::Tag("text")).await.unwrap() {
el.push(text_from!(r));
}
el
};
let ratings_txt: Vec<String> = ratings_html
.into_iter()
.filter(|x| x.chars().all(char::is_numeric) || x == "N/A")
.collect();
ratings.insert(
"member".into(),
(if ratings_txt[0] == "N/A" {
None
} else {
Some(ratings_txt[0].parse::<usize>().unwrap())
})
.into(),
);
ratings.insert(
"critic".into(),
(if ratings_txt[1] == "N/A" {
None
} else {
Some(ratings_txt[1].parse::<usize>().unwrap())
})
.into(),
);
info.insert("ratings".into(), ratings.into());
let mut ttb: Map<String, Value> = Map::new();
if let Some(ttb_data) = b
.get_element_by_xpath(r#"//*[@id="content-page"]/div[2]/aside/table/tbody"#)
.await
{
for row in ttb_data.find_all(By::Tag("tr")).await.unwrap() {
ttb.insert(
remove_last_n_chars(&text_from!(row.find(By::Tag("th")).await.unwrap()), 1),
text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
);
}
info.insert("time_to_beat".into(), ttb.into());
} else {
log::warn!("could not get time to beat");
}
b.scroll_to_end().await.unwrap();
if let Some(show_more) = b.get_element_by_xpath(r#"//*[@id="game-storyline"]/span[@class="text-purple cursor-pointer charLimitMore"]"#).await {
show_more.click().await.unwrap();
}
if let Some(storyline_html) = b
.get_element_text_by_xpath(r#"//*[@id="game-storyline"]/p"#)
.await
{
info.insert("storyline".into(), storyline_html.into());
} else {
log::warn!("could not get storyline");
}
let recommend_div = b
.get_element_by_xpath(r#"//*[@id="content-page"]/div[2]/div[2]/ul/div[2]/div"#)
.await
.unwrap();
let mut recommended: Vec<String> = vec![];
for game in recommend_div.find_all(By::Tag("li")).await.unwrap() {
let game_link = game.find(By::Tag("a")).await.unwrap();
recommended.push(b.get_url_from_link(game_link).await);
}
info.insert("recommendations".into(), recommended.into());
b.scroll_to_end().await.unwrap();
if let Some(show_all_langs) = b
.get_element_by_xpath(r#"//*[@class="language-supports-display"]/button"#)
.await
{
show_all_langs.click().await.unwrap();
}
for el in b.get_elements_by_xpath(r#"//*[@class="optimisly-game-extrainfo2"]/div/div/span[@class="text-purple cursor-pointer"]"#).await.unwrap() {
el.click().await.unwrap();
}
let mut extra_info = String::new();
let extra_info_html = b
.get_element_by_xpath(r#"//*[@class="optimisly-game-extrainfo2"]"#)
.await
.unwrap();
for el in extra_info_html.find_all(By::XPath("./*")).await.unwrap() {
extra_info.push_str(&format!("{}\n", text_from!(el)));
}
let mut extra_map: HashMap<String, Vec<Value>> = HashMap::new();
let mut extra_map_new: HashMap<String, Value> = HashMap::new();
let mut last = String::new();
for line in extra_info.lines() {
let line = line.trim();
if line.is_empty() {
continue;
}
if line.ends_with(':') {
last = remove_last_n_chars(line, 1);
extra_map.insert(last.clone(), vec![]);
} else {
extra_map.get_mut(&last).unwrap().push(line.into());
}
}
for key in extra_map.keys() {
if key == "Localized titles" {
let titles = extra_map.get(key).unwrap();
let mut title_map: Map<String, Value> = Map::new();
for title in titles {
let (lang, val) = title.as_str().unwrap().split_once(": ").unwrap();
title_map.insert(lang.into(), val.into());
}
extra_map_new.insert(key.into(), title_map.into());
}
if key == "Alternative titles" {
let titles = extra_map.get(key).unwrap();
let mut title_map: Map<String, Value> = Map::new();
for title in titles {
let (lang, val) = title.as_str().unwrap().split_once(": ").unwrap();
title_map.insert(lang.into(), val.into());
}
extra_map_new.insert(key.into(), title_map.into());
}
if key == "Keywords" {
let keywords = extra_map.get(key).unwrap()[0].to_string();
extra_map_new.insert(
key.into(),
remove_last_n_chars(&keywords, 1)[1..]
.split(", ")
.map(std::string::ToString::to_string)
.collect(),
);
}
if key == "Supported Languages" {
let mut supported_langs: Map<String, Value> = json!({
"audio": Vec::<String>::new(),
"subtitles": Vec::<String>::new(),
"interface": Vec::<String>::new()
})
.as_object()
.unwrap()
.clone();
let lang_html = b
.get_element_by_xpath(r#"//*[@class="language-supports-display"]/table/tbody"#)
.await
.unwrap();
for lang in lang_html.find_all(By::Tag("tr")).await.unwrap() {
let support = lang.find_all(By::Tag("td")).await.unwrap();
let lang_name = remove_last_n_chars(&text_from!(support[0]), 1);
if text_from!(support[1]) == "" {
supported_langs
.get_mut("audio")
.unwrap()
.as_array_mut()
.unwrap()
.push(lang_name.clone().into());
}
if text_from!(support[2]) == "" {
supported_langs
.get_mut("subtitles")
.unwrap()
.as_array_mut()
.unwrap()
.push(lang_name.clone().into());
}
if text_from!(support[3]) == "" {
supported_langs
.get_mut("interface")
.unwrap()
.as_array_mut()
.unwrap()
.push(lang_name.into());
}
}
extra_map_new.insert(key.into(), supported_langs.into());
}
}
extra_map.remove_entry("Localized titles");
extra_map.remove_entry("Alternative titles");
extra_map.remove_entry("Keywords");
extra_map.remove_entry("Supported Languages");
let extra_map: HashMap<String, Value> = extra_map
.into_iter()
.map(|(old_key, value)| {
let new_key = escape_key(&old_key);
(new_key, value.into())
})
.collect();
let extra_map_new: HashMap<String, Value> = extra_map_new
.into_iter()
.map(|(old_key, value)| {
let new_key = escape_key(&old_key);
(new_key, value)
})
.collect();
info.extend(extra_map);
info.extend(extra_map_new);
Ok(info)
}
}
#[async_trait]
impl Extractor for IGDBExtractor {
fn name(&self) -> String {
"IGDB".to_owned()
}
fn supported_hosts(&self) -> Vec<&str> {
vec!["www.igdb.com"]
}
async fn run_scrape(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
self.igdb_game(url, b, conf).await
}
}

159
src/extractors/mediamarkt.rs Normal file

@@ -0,0 +1,159 @@
use crate::util::{currency, escape_key, remove_last_n_chars};
use super::prelude::*;
pub struct MediamarktExtractor {}
impl MediamarktExtractor {
pub fn new() -> Self {
Self {}
}
async fn product(
&self,
url: Url,
b: &mut crate::Browser,
_conf: &Config,
) -> Result<HashMap<String, Value>, String> {
b.goto(url.to_string()).await.unwrap();
let mut info: HashMap<String, Value> = HashMap::new();
info.insert(
"title".into(),
b.get_element_text_by_xpath(r#"//div[@data-test="mms-select-details-header"]/h1"#)
.await
.unwrap()
.into(),
);
let product_info_elements = b
.get_elements_by_xpath(
r#"//div[@data-test="mms-select-details-header"]//p[@font-family="default"]"#,
)
.await
.unwrap();
let re = regex::Regex::new(r"[-+]?\d*\.\d+|\d+").unwrap();
let ratings: Vec<f64> = re
.find_iter(&text_from!(product_info_elements.first().unwrap()))
.map(|m| m.as_str().parse::<f64>().unwrap())
.collect();
info.insert("rating".into(), ratings[0].into());
info.insert("amount_of_ratings".into(), ratings[1].into());
info.insert(
"product_number".into(),
text_from!(product_info_elements[1])
.replace("Art.-Nr. ", "")
.into(),
);
if let Some(discount) = b
.get_element_text_by_xpath(
r#"//div[@data-test="mms-product-price"]//div[@data-test="mms-badge"]/span"#,
)
.await
{
info.insert("discount".into(), discount.into());
}
if let Some(orig_price) = b.get_element_text_by_xpath(r#"//div[@data-test="mms-product-price"]//div[@data-test="mms-badge"]/../p[1]/span[3]"#).await {
let orig_price = format!("{}{}", &orig_price.chars().skip(1).collect::<String>(), &orig_price.chars().take(1).collect::<String>());
info.insert("original_price".into(), currency(&orig_price).into());
}
let price = remove_last_n_chars(
&b.get_element_text_by_xpath(r#"//span[@data-test="branded-price-whole-value"]"#)
.await
.unwrap(),
1,
);
let price = format!(
"{}{}",
&price.chars().skip(2).collect::<String>(),
&price.chars().take(2).collect::<String>()
);
info.insert("price".into(), currency(price.trim()).into());
if let Some(price_decimal) = b
.get_element_text_by_xpath(r#"//span[@data-test="branded-price-decimal-value"]"#)
.await
{
let decimal = if price_decimal == "" {
0.0
} else {
format!("0.{price_decimal}").parse::<f64>().unwrap()
};
let old_v = info
.get_mut("price")
.unwrap()
.as_object_mut()
.unwrap()
.get_mut("value")
.unwrap()
.as_f64()
.unwrap();
info.get_mut("price")
.unwrap()
.as_object_mut()
.unwrap()
.insert("value".into(), (old_v + decimal).into());
}
let mut data_information: Map<String, Value> = Map::new();
let features_html = b
.get_elements_by_xpath(r#"//div[@data-test="pdp-features-content"]/div/div/table"#)
.await
.unwrap();
b.scroll_to_element(features_html.first().unwrap())
.await
.unwrap();
for feature in features_html {
let title = escape_key(&text_from!(feature
.find(By::XPath("./thead//p"))
.await
.unwrap()));
data_information.insert(title.clone(), Value::Object(Map::new()));
b.scroll_to_element(&feature).await.unwrap();
for info in feature.find_all(By::XPath("./tbody/tr")).await.unwrap() {
b.scroll_to_element(&info).await.unwrap();
std::thread::sleep(std::time::Duration::from_millis(50));
let info_html = info.find_all(By::XPath("./td/p")).await.unwrap();
let key = escape_key(&text_from!(info_html[0]));
if key.is_empty() {
continue;
}
let val = text_from!(info_html[1]);
data_information
.get_mut(&title)
.unwrap()
.as_object_mut()
.unwrap()
.insert(key, val.into());
}
}
info.insert("information".into(), data_information.into());
Ok(info)
}
}
#[async_trait]
impl Extractor for MediamarktExtractor {
fn name(&self) -> String {
"MEDIAMARKT".to_owned()
}
fn supported_hosts(&self) -> Vec<&str> {
vec!["www.mediamarkt.de"]
}
async fn run_scrape(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
self.product(url, b, conf).await
}
}

158
src/extractors/mod.rs Normal file

@@ -0,0 +1,158 @@
use std::collections::HashMap;
use async_trait::async_trait;
use chrono::Utc;
use serde_json::Value;
use url::Url;
mod amazon;
mod anilist;
mod anisearch;
mod aur;
mod igdb;
mod mediamarkt;
mod postman;
mod steam;
mod tmdb;
use crate::Config;
mod prelude {
pub use super::Extractor;
pub use crate::Config;
pub use crate::{attr_from, text_from, try_attr_from};
pub use async_trait::async_trait;
pub use serde_json::{json, Map, Value};
pub use std::collections::HashMap;
pub use std::time::Duration;
pub use thirtyfour::By;
pub use url::Url;
}
#[async_trait]
/// A trait for defining custom extractors to scrape data from web pages.
pub trait Extractor {
/// Checks if the provided URL can be handled by this extractor.
///
/// # Arguments
///
/// * `url` - The URL to be matched against the extractor's capabilities.
///
/// # Returns
///
/// Returns `true` if the extractor can handle the given URL, otherwise `false`.
fn match_url(&self, url: &Url) -> bool {
if let Some(host_str) = url.host_str() {
if self.supported_hosts().contains(&host_str) {
return true;
}
}
false
}
/// Retrieves supported hosts for the extractor
///
/// # Returns
///
/// Returns a `Vec` of hosts supported by this extractor.
fn supported_hosts(&self) -> Vec<&str>;
/// Retrieves the name of the extractor.
///
/// # Returns
///
/// Returns a `String` containing the name of the extractor.
fn name(&self) -> String;
/// Performs the web scraping operation on the provided URL using the given browser
/// instance and configuration.
///
/// # Arguments
///
/// * `url` - The URL to perform scraping on.
/// * `browser` - A mutable reference to the browser instance used for scraping.
/// * `conf` - A reference to the configuration settings for scraping.
///
/// # Returns
///
/// Returns a `Result` indicating either a successful scraping operation with a
/// `HashMap` containing extracted data, or an error message as a `String` if the
/// operation fails.
async fn run_scrape(
&self,
url: Url,
browser: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String>;
}
/// Get a list of all extractors registered.
#[must_use]
pub fn get_extractors() -> Vec<Box<dyn Extractor>> {
vec![
Box::new(igdb::IGDBExtractor::new()),
Box::new(aur::AURExtractor::new()),
Box::new(amazon::AmazonExtractor::new()),
Box::new(anilist::AnilistExtractor::new()),
Box::new(anisearch::AnisearchExtractor::new()),
Box::new(mediamarkt::MediamarktExtractor::new()),
Box::new(postman::PostmanExtractor::new()),
Box::new(steam::SteamExtractor::new()),
Box::new(tmdb::TmdbExtractor::new()),
]
}
pub async fn scrape_url(url: &str, conf: &Config) {
let p_url = Url::parse(url).expect("Invalid URL");
let ts = Utc::now();
let mut data: Option<Result<HashMap<String, Value>, String>> = None;
let mut browser = crate::Browser::new(conf).await.unwrap();
if let Some(force_ext) = &conf.force_extractor {
let extractors = get_extractors();
let ex = extractors
.iter()
.find(|x| x.name() == *force_ext)
.unwrap()
.to_owned();
log::info!("Scraping '{}'", p_url.to_string());
log::info!("Using extractor {}", ex.name());
data = Some(ex.run_scrape(p_url, &mut browser, conf).await);
} else {
for x in get_extractors() {
if x.match_url(&p_url) {
log::info!("Scraping '{}'", p_url.to_string());
log::info!("Using extractor {}", x.name());
data = Some(x.run_scrape(p_url, &mut browser, conf).await);
break;
}
}
}
browser.quit().await;
if data.is_none() {
log::error!("Site not supported");
std::process::exit(1);
}
let data = data.unwrap();
let mut data = match data {
Ok(data) => data,
Err(e) => {
let mut h = HashMap::new();
log::error!("Scrape failed: {e}");
h.insert("error".into(), e.into());
h
}
};
if conf.save_ts {
data.insert("scraped_at".to_string(), ts.timestamp_nanos().into());
}
println!("{}", serde_json::to_string(&data).unwrap());
}

322
src/extractors/postman.rs Normal file

@@ -0,0 +1,322 @@
use crate::util::{escape_unsafe_characters, handle_media_url, remove_last_n_chars, window};
use super::prelude::*;
pub struct PostmanExtractor {}
impl PostmanExtractor {
pub fn new() -> Self {
Self {}
}
}
impl PostmanExtractor {
async fn torrent(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
b.goto(url.to_string()).await.unwrap();
let mut info: HashMap<String, Value> = HashMap::new();
let info_table_html = b
.get_element_by_xpath(r#"//*[@id="td_props"]/tbody"#)
.await
.ok_or("could not get info table")?;
let entries = info_table_html.find_all(By::Tag("tr")).await.unwrap();
for entry in entries.iter().take(entries.len() - 1) {
if let Ok(key_name_el) = entry.find(By::XPath(r#"./td[@class="label"]/b"#)).await {
let key_name = text_from!(key_name_el);
let content = text_from!(entry.find(By::XPath("./td[2]")).await.unwrap());
match key_name.as_str() {
"Name:" => {
info.insert("name".into(), content.into());
}
"Torrent file:" => {
let torrent_file_url = b
.get_url_from_link(entry.find(By::XPath("./td[2]/a[1]")).await.unwrap())
.await;
info.insert(
"torrent_file".into(),
handle_media_url(&torrent_file_url, &content, true, conf)
.await
.into(),
);
}
"Magnet:" => {
info.insert(
"magnet_url".into(),
attr_from!(
entry.find(By::XPath("./td[2]/a[1]")).await.unwrap(),
"href"
)
.into(),
);
}
"Infohash:" => {
info.insert("infohash".into(), content.into());
}
"Size:" => {
info.insert("size".into(), content.into());
}
"Owner:" => {
if content != "hidden" && content != "none (abandoned torrent)" {
info.insert("owner".into(), content.into());
let level = attr_from!(
entry
.find(By::XPath("./td[2]/span[1]/img[1]"))
.await
.unwrap(),
"src"
);
info.insert(
"owner_level".into(),
remove_last_n_chars(level.split('/').last().unwrap(), 4)
.parse::<isize>()
.unwrap()
.into(),
);
}
}
"Main Languages:" => {
let languages_html =
entry.find_all(By::XPath("./td[2]/span")).await.unwrap();
let mut languages: Vec<String> = vec![];
for lang in languages_html {
languages.push(attr_from!(lang, "title"));
}
info.insert("main_languages".into(), languages.into());
}
"Subtitle Languages:" => {
let languages_html =
entry.find_all(By::XPath("./td[2]/span")).await.unwrap();
let mut languages: Vec<String> = vec![];
for lang in languages_html {
languages.push(attr_from!(lang, "title"));
}
info.insert("subtitle_languages".into(), languages.into());
}
"Hits / Downloads:" => {
let (hits_amount, downloads_amount) = content.split_once(" / ").unwrap();
info.insert(
"hits_amount".into(),
hits_amount.parse::<isize>().unwrap().into(),
);
info.insert(
"downloads_amount".into(),
downloads_amount.parse::<isize>().unwrap().into(),
);
}
"Seeders / Leechers:" => {
let (seeders_amount, leechers_amount) = content.split_once(" / ").unwrap();
info.insert(
"seeders_amount".into(),
seeders_amount.parse::<isize>().unwrap().into(),
);
info.insert(
"leechers_amount".into(),
leechers_amount.parse::<isize>().unwrap().into(),
);
}
"Added / Last Active:" => {
let (added_timestamp, last_active_timestamp) =
content.split_once(" / ").unwrap();
info.insert("added_timestamp".into(), added_timestamp.into());
info.insert(
"last_active_timestamp".into(),
(if last_active_timestamp == "No active seeders in DB" {
None
} else {
Some(last_active_timestamp)
})
.into(),
);
}
"Rating:" => {
info.insert(
"rating".into(),
attr_from!(
entry
.find(By::XPath(r#"./td[2]/span[@id="ratingbars"]"#))
.await
.unwrap(),
"title"
)
.split_whitespace()
.next()
.unwrap()
.parse::<f64>()
.unwrap()
.into(),
);
}
"Description:" => {
info.insert("description".into(), content.into());
}
"Category:" => {
info.insert("category".into(), content.into());
}
"Subtitles:" => {
if !content.is_empty() {
info.insert("subtitles".into(), content.into());
}
}
"Length:" => {
if !content.is_empty() {
info.insert("length".into(), content.into());
}
}
"Genre:" => {
if !content.is_empty() {
info.insert("genre".into(), content.into());
}
}
"Codec:" => {
if !content.is_empty() {
info.insert("codec".into(), content.into());
}
}
"Ripper Info:" => {
if !content.is_empty() {
info.insert("ripper_info".into(), content.into());
}
}
"Format:" => {
if !content.is_empty() {
info.insert("format".into(), content.into());
}
}
"Bitrate:" => {
if !content.is_empty() {
info.insert("bitrate".into(), content.into());
}
}
"Banned:" => {
info.insert("banned".into(), (content == "yes").into());
}
"Immutable:" => {
info.insert("immutable".into(), (content == "yes").into());
}
"Visible:" => {
info.insert("visible".into(), (content == "yes").into());
}
"Comment Handling:" => {}
_ => {
log::debug!("unknown key {key_name}");
}
}
}
}
let mut files: Vec<Map<String, Value>> = vec![];
let files_info_html: Vec<_> = b
.get_elements_by_xpath(r#"//*[@id="td_files"]/tbody/*"#)
.await
.ok_or("could not get files info")?
.into_iter()
.skip(1)
.collect();
for entry in files_info_html {
let file_name = text_from!(entry.find(By::XPath("./td[1]")).await.unwrap());
let file_size = text_from!(entry.find(By::XPath("./td[2]")).await.unwrap());
files.push(
json!({
"file_name": file_name,
"file_size": file_size
})
.as_object()
.unwrap()
.clone(),
);
}
info.insert("files".into(), files.into());
let mut attachments: Map<String, Value> = Map::new();
let attachment_html = b
.get_element_by_xpath(r#"//table[@id="td_attachments"]/tbody/tr[1]/td[1]"#)
.await
.ok_or("could not get attachments")?;
for el in attachment_html
.find_all(By::XPath("./a/img"))
.await
.unwrap()
{
let attachment_title = attr_from!(el, "title");
let attachment_url = b.get_absolute_url(&attr_from!(el, "src")).await;
attachments.insert(
attachment_title.clone(),
handle_media_url(
&attachment_url,
&format!("{}.png", escape_unsafe_characters(&attachment_title)),
true,
conf,
)
.await
.into(),
);
}
info.insert("attachments".into(), attachments.into());
if let Some(comments_html) = b.get_element_by_xpath(r#"//*[@id="comments"]/tbody"#).await {
let mut comments: Vec<Map<String, Value>> = vec![];
for comment in window(&comments_html.find_all(By::Tag("tr")).await.unwrap(), 2) {
let comment_user =
text_from!(comment[0].find(By::XPath("./th/b/i")).await.unwrap());
let comment_ts = text_from!(comment[0]
.find(By::XPath(r#"./th/span[@class="commentdate"]"#))
.await
.unwrap())[7..]
.to_owned();
let mut comment_content = text_from!(comment[1]
.find(By::XPath(r#"./td/span[@class="commenttext"]"#))
.await
.unwrap());
let comment_content_html = comment[1]
.find_all(By::XPath(r#"./td/span[@class="commenttext"]/a"#))
.await
.unwrap();
for el in comment_content_html {
if el.tag_name().await.unwrap().as_str() == "a" {
let link = format!("[{}]({})", text_from!(el), attr_from!(el, "href"));
comment_content = comment_content.replace(&text_from!(el), &link);
}
}
comments.push(
json!({
"user": comment_user,
"timestamp": comment_ts,
"content": comment_content
})
.as_object()
.unwrap()
.clone(),
);
}
info.insert("comments".into(), comments.into());
}
Ok(info)
}
}
#[async_trait]
impl Extractor for PostmanExtractor {
fn name(&self) -> String {
"POSTMAN".to_owned()
}
fn supported_hosts(&self) -> Vec<&str> {
vec!["tracker2.postman.i2p"]
}
async fn run_scrape(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
self.torrent(url, b, conf).await
}
}

145
src/extractors/steam.rs Normal file

@@ -0,0 +1,145 @@
use crate::util::{currency, parse_date};
use super::prelude::*;
pub struct SteamExtractor;
impl SteamExtractor {
pub fn new() -> Self {
Self {}
}
async fn steam_game(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
let mut url = url;
let lang = conf.language.clone();
match lang {
crate::Language::de_DE => {
url.query_pairs_mut().append_pair("l", "german");
}
crate::Language::en_US => {
url.query_pairs_mut().append_pair("l", "english");
}
}
log::info!(
"Changing to '{url}' because of {:?} language",
conf.language
);
b.goto(url.to_string()).await.unwrap();
if b.current_url().await.unwrap().path().contains("agecheck") {
log::info!("Game is behind age restriction");
let year = b
.get_element_by_xpath(r#"//*[@id="ageYear"]"#)
.await
.unwrap();
thirtyfour::components::SelectElement::new(&year)
.await
.unwrap()
.select_by_value("1900")
.await
.unwrap();
b.click_on_xpath(r#"//*[@id="view_product_page_btn"]"#)
.await;
b.wait_for(r#"//*[@id="appHubAppName"]"#, Duration::from_secs(5))
.await;
b.goto(url.to_string()).await.unwrap();
}
let game_name = b
.get_element_text_by_xpath(r#"//*[@id="appHubAppName"]"#)
.await
.unwrap();
let game_description = b
.get_element_text_by_xpath(r#"//*[@class="game_description_snippet"]"#)
.await
.unwrap();
let game_release = b
.get_element_text_by_xpath(r#"//*[@class="release_date"]/div[2]"#)
.await
.unwrap();
let game_release = match lang {
crate::Language::de_DE => parse_date(&game_release, "%d. %b. %Y").unwrap(),
crate::Language::en_US => parse_date(&game_release, "%d %b, %Y").unwrap(),
};
let game_developer = b
.get_element_text_by_xpath(r#"//*[@id="developers_list"]/a"#)
.await
.unwrap();
let game_publisher = text_from!(b
.get_elements_by_xpath(r#"//*[@class="dev_row"]/div[2]"#)
.await
.unwrap()
.get(1)
.unwrap());
let mut game_price: Option<Map<String, Value>> = None;
if let Some(game_orig_price_html) = b.get_elements_by_xpath(r#"//*[@class="game_area_purchase_game_wrapper"]/div/div[2]/div/div[1]/div[2]/*[@class="discount_original_price"]"#).await.unwrap().first() {
let game_orig_price = text_from!(game_orig_price_html);
let game_discount_price = text_from!(b.get_elements_by_xpath(r#"//*[@class="game_area_purchase_game_wrapper"]/div/div[2]/div/div[1]/div[2]/*[@class="discount_final_price"]"#).await.unwrap().first().unwrap());
game_price = Some(json!({
"original_price": currency(&game_orig_price),
"discount_price": currency(&game_discount_price),
}).as_object().unwrap().clone());
} else if let Some(game_price_html) = b.get_element_text_by_xpath(r#"//*[@class="game_area_purchase_game_wrapper"]/div/div[2]/div/*[@class="game_purchase_price price"]"#).await {
game_price = Some(currency(&game_price_html));
} else {
let check_free_price = b.get_element_text_by_xpath(r#"//*[@class="game_purchase_action"]/div[1]/div[@class="game_purchase_price price"]"#).await.unwrap();
match lang {
crate::Language::de_DE => {
if check_free_price == "Kostenlos" {
game_price = Some(currency("0.0€"));
}
},
crate::Language::en_US => {
if check_free_price == "Free" {
game_price = Some(currency("0.0$"));
}
}
}
}
let mut info = HashMap::new();
info.insert("name".into(), game_name.into());
info.insert("description".into(), game_description.into());
info.insert("release".into(), game_release.into());
info.insert("developer".into(), game_developer.into());
info.insert("publisher".into(), game_publisher.into());
info.insert("price".into(), game_price.into());
Ok(info)
}
}
#[async_trait]
impl Extractor for SteamExtractor {
fn supported_hosts(&self) -> Vec<&str> {
vec!["store.steampowered.com"]
}
fn name(&self) -> String {
"STEAM".to_string()
}
async fn run_scrape(
&self,
url: Url,
browser: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
self.steam_game(url, browser, conf).await
}
}

338
src/extractors/tmdb.rs Normal file

@@ -0,0 +1,338 @@
use crate::util::{
escape_key, extract_attrs_from_elements, extract_texts_from_elements, handle_media_url,
parse_date, remove_last_n_chars,
};
use super::prelude::*;
pub struct TmdbExtractor;
impl TmdbExtractor {
pub fn new() -> Self {
Self {}
}
async fn series(
&self,
url: Url,
b: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
b.goto(url.to_string()).await.unwrap();
let prefs = json!({
"i18n_fallback_language": "en-US",
"locale": "en-US",
"country_code": "US",
});
let prefs_str = serde_json::to_string(&prefs).unwrap();
let lang_cookie =
thirtyfour::Cookie::build("tmdb.prefs", urlencoding::encode(&prefs_str).into_owned())
.domain("www.themoviedb.org")
.path("/")
.expires(None)
.secure(true)
.http_only(true)
.same_site(thirtyfour::cookie::SameSite::Lax)
.finish();
b.delete_cookie("tmdb.prefs").await.unwrap();
b.add_cookie(lang_cookie).await.unwrap();
b.refresh().await.unwrap();
let mut info: HashMap<String, Value> = HashMap::new();
if b.get_element_by_xpath(r#"//*[@id="main"]//div[@class="error_wrapper"]"#)
.await
.is_some()
{
Err("page unavailable")?;
}
info.insert(
"title".into(),
b.get_element_text_by_xpath(
r#"//*[@id="original_header"]//section[@class="header poster"]/div/h2/a"#,
)
.await
.unwrap()
.into(),
);
info.insert(
"release_year".into(),
remove_last_n_chars(
&b.get_element_text_by_xpath(
r#"//*[@id="original_header"]//section[@class="header poster"]/div/h2/span"#,
)
.await
.unwrap()[1..],
1,
)
.into(),
);
let rating_html = attr_from!(
b.get_element_by_xpath(r#"//div[@class="user_score_chart"]/div[1]/span"#)
.await
.unwrap(),
"class"
);
let rating = rating_html.split("icon-r").nth(1).unwrap();
info.insert(
"user_rating".into(),
rating.parse::<isize>().unwrap().into(),
);
if let Some(age_certification) = b
.get_element_text_by_xpath(
r#"//*[@id="original_header"]//span[@class="certification"]"#,
)
.await
{
info.insert("age_certification".into(), age_certification.into());
}
let genres_html = b
.get_elements_by_xpath(r#"//*[@id="original_header"]//span[@class="genres"]/a"#)
.await
.unwrap();
info.insert(
"genres".into(),
extract_texts_from_elements(genres_html).await.into(),
);
info.insert(
"overview".into(),
b.get_element_text_by_xpath(r#"//*[@id="original_header"]//div[@class="overview"]"#)
.await
.unwrap()
.into(),
);
let cover_url = b
.get_absolute_url(
&attr_from!(
b.get_element_by_xpath(
r#"//*[@id="original_header"]//div[@class="poster"]//img"#
)
.await
.unwrap(),
"src"
)
.replace("_filter(blur)", ""),
)
.await;
info.insert(
"cover".into(),
handle_media_url(&cover_url, "cover", false, conf)
.await
.into(),
);
for fact in b
.get_elements_by_xpath(r#"//*[@id="media_v4"]//section[@class="facts left_column"]/p"#)
.await
.unwrap()
{
if let Ok(key) = fact.find(By::XPath("./strong")).await {
let key = text_from!(key);
if key == "Networks" {
continue;
}
info.insert(
escape_key(&key),
text_from!(fact).replace(&format!("{key}\n"), "").into(),
);
}
}
let mut tags: Vec<String> = vec![];
for tag in b
.get_elements_by_xpath(
r#"//*[@id="media_v4"]//section[@class="keywords right_column"]/ul[1]/li"#,
)
.await
.unwrap()
{
tags.push(text_from!(tag.find(By::XPath("./a")).await.unwrap()));
}
info.insert("tags".into(), tags.into());
let all_seasons_url = b
.get_url_from_link(
b.get_element_by_xpath(
r#"//*[@id="media_v4"]//section[@class="panel season"]/p[1]/a"#,
)
.await
.unwrap(),
)
.await;
b.goto(all_seasons_url).await.unwrap();
let mut seasons: Vec<Map<String, Value>> = vec![];
let mut seasons_urls: Vec<String> = vec![];
for s in extract_attrs_from_elements(
b.get_elements_by_xpath(
r#"//*[@id="media_v4"]//div[@class="season_wrapper"]/section/div/a"#,
)
.await
.unwrap(),
"href",
)
.await
{
seasons_urls.push(b.get_absolute_url(&s).await);
}
for season in seasons_urls {
let season_data = self.season_page(season, b, conf).await?;
seasons.push(season_data);
}
info.insert("seasons".into(), seasons.into());
Ok(info)
}
async fn season_page(
&self,
url: String,
b: &mut crate::Browser,
_conf: &Config,
) -> Result<Map<String, Value>, String> {
b.goto(url.clone()).await.unwrap();
let mut season: Map<String, Value> = Map::new();
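// The season number is taken from the last path segment of the season URL (e.g. a URL ending in `/season/2` yields 2).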
season.insert(
"season_number".into(),
url.split('/')
.last()
.unwrap()
.parse::<isize>()
.unwrap()
.into(),
);
season.insert(
"title".into(),
b.get_element_text_by_xpath(
r#"//*[@id="main"]//span[@class="flex poster"]/span/div/h2/a"#,
)
.await
.unwrap()
.into(),
);
if let Some(release_year) = b
.get_element_text_by_xpath(
r#"//*[@id="main"]//span[@class="flex poster"]/span/div/h2/span"#,
)
.await
{
season.insert(
"release_year".into(),
remove_last_n_chars(&release_year[1..], 1).into(),
);
}
season.insert(
"amount_of_episodes".into(),
b.get_element_text_by_xpath(
r#"//*[@id="main_column"]//h3[@class="episode_sort space"]/span"#,
)
.await
.unwrap()
.parse::<isize>()
.unwrap()
.into(),
);
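// Every episode is rendered as a `div.card` inside the episode list; pull number, title, rating, air date (when present), runtime and overview from each card.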
let mut episodes: Vec<Map<String, Value>> = vec![];
let episodes_html = b
.get_element_by_xpath(r#"//*[@id="main_column"]//div[@class="episode_list"]"#)
.await
.unwrap();
for e in episodes_html
.find_all(By::XPath(r#"./div[@class="card"]"#))
.await
.unwrap()
{
let mut episode: Map<String, Value> = Map::new();
episode.insert(
"episode_number".into(),
text_from!(e
.find(By::XPath(r#".//span[@class="episode_number"]"#))
.await
.unwrap())
.parse::<isize>()
.unwrap()
.into(),
);
episode.insert(
"title".into(),
text_from!(e
.find(By::XPath(r#".//div[@class="episode_title"]//a"#))
.await
.unwrap())
.into(),
);
episode.insert(
"rating".into(),
text_from!(e
.find(By::XPath(
r#".//div[@class="episode_title"]/div[1]/div[1]/div[1]"#
))
.await
.unwrap())
.parse::<f64>()
.unwrap()
.into(),
);
if let Ok(release_date) = e
.find(By::XPath(
r#".//div[@class="episode_title"]//div[@class="date"]/span[@class="date"]"#,
))
.await
{
episode.insert(
"release_date".into(),
parse_date(&text_from!(release_date), "%B %d, %Y").into(),
);
}
episode.insert("runtime".into(), text_from!(e.find(By::XPath(r#".//div[@class="episode_title"]//div[@class="date"]/span[@class="runtime"]"#)).await.unwrap()).into());
episode.insert(
"overview".into(),
text_from!(e
.find(By::XPath(
r#".//div[@class="info"]//div[@class="overview"]/p"#
))
.await
.unwrap())
.into(),
);
episodes.push(episode);
}
season.insert("episodes".into(), episodes.into());
Ok(season)
}
}
#[async_trait]
impl Extractor for TmdbExtractor {
fn supported_hosts(&self) -> Vec<&str> {
vec!["www.themoviedb.org"]
}
fn name(&self) -> String {
"TMDB".to_string()
}
async fn run_scrape(
&self,
url: Url,
browser: &mut crate::Browser,
conf: &Config,
) -> Result<HashMap<String, Value>, String> {
self.series(url, browser, conf).await
}
}

356
src/lib.rs Normal file
View file

@ -0,0 +1,356 @@
use std::{
ops::Deref,
process::{Child, Command, Stdio},
str::FromStr,
};
use strum::EnumVariantNames;
pub mod extractors;
pub mod util;
use thirtyfour::prelude::*;
/// A convenience macro for extracting text content from a web element expression.
///
/// This macro takes a single expression `$expr` that represents a web element. It uses
/// the `.text()` method to asynchronously extract text content from the web element and
/// immediately unwraps the result. This macro is useful for simplifying the process of
/// extracting text content from web elements.
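///
/// Illustrative usage (assuming an async context and an in-scope `WebElement` named `el`):
/// `let title = text_from!(el);`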
#[macro_export]
macro_rules! text_from {
($expr:expr) => {
$expr.text().await.unwrap()
};
}
/// A convenience macro for extracting an attribute's value from a web element expression.
///
/// This macro takes two expressions as arguments: `$expr`, which represents a web element,
/// and `$attr`, which is the name of the attribute to extract. It uses the `.attr()` method
/// to asynchronously extract the value of the specified attribute from the web element and
/// immediately unwraps the result. This macro simplifies the process of attribute extraction.
///
/// # Note
///
/// If the attribute is not present, the macro will panic when trying to unwrap it.
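///
/// Illustrative usage (assuming an async context and a `WebElement` named `el`):
/// `let href = attr_from!(el, "href");`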
#[macro_export]
macro_rules! attr_from {
($expr:expr, $attr:expr) => {
$expr.attr($attr).await.unwrap().unwrap()
};
}
/// A convenience macro for attempting to extract an attribute's value from a web element expression.
///
/// This macro takes two expressions as arguments: `$expr`, which represents a web element,
/// and `$attr`, which is the name of the attribute to extract. It uses the `.attr()` method
/// to asynchronously extract the value of the specified attribute from the web element. If the
/// attribute is not present, it returns an `Option` with `None`.
///
/// # Note
///
/// This macro returns an `Option` containing the attribute value, which can be either `Some(value)`
/// or `None` if the attribute is absent.
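///
/// Illustrative usage (assuming an async context and a `WebElement` named `el`):
/// `let alt: Option<String> = try_attr_from!(el, "alt");`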
#[macro_export]
macro_rules! try_attr_from {
($expr:expr, $attr:expr) => {
$expr.attr($attr).await.unwrap()
};
}
/// A struct representing a web browser instance.
pub struct Browser {
driver: Option<WebDriver>,
cmd: Child,
}
impl Browser {
pub async fn new(conf: &Config) -> Option<Self> {
// TODO : setup http proxy
let mut caps = DesiredCapabilities::chrome();
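// Assumes a `chromedriver` binary is available on `PATH`; its output is discarded and the WebDriver endpoint is expected at `http://localhost:9515`.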
let child = Command::new("chromedriver")
.args(vec!["-p", "9515"])
.stdout(Stdio::null())
.stderr(Stdio::null())
.spawn()
.ok()?;
if let Some(http_proxy) = conf.http_proxy.clone() {
let (_host, _port) = http_proxy.split_once(':').unwrap();
caps.add_chrome_arg(&format!("--proxy-server={http_proxy}"))
.unwrap();
}
let driver = WebDriver::new("http://localhost:9515", caps).await.ok()?;
Some(Self {
driver: Some(driver),
cmd: child,
})
}
/// Scrolls to the end of the web page using the browser's `WebDriver`.
///
/// This asynchronous method is used to scroll to the bottom of a web page by executing a JavaScript
/// script using the browser's `WebDriver`. It takes no arguments and returns a `Result` containing
/// either a `ScriptRet` indicating the script execution result or a `WebDriverError` if an error occurs.
///
/// # Returns
///
/// A `Result` containing either a `ScriptRet` indicating the script execution result or a `WebDriverError`.
pub async fn scroll_to_end(&self) -> Result<ScriptRet, WebDriverError> {
self.driver
.as_ref()
.unwrap()
.execute(
"window.scrollTo(0, document.body.scrollHeight);",
Vec::new(),
)
.await
}
/// Clicks on a web element using its `XPath` expression.
///
/// This asynchronous method is used to locate a web element using its `XPath` expression,
/// and then perform a click action on it.
///
/// The method does not return any value but panics if the operation fails.
///
/// # Arguments
///
/// * `xpath` - The `XPath` expression used to locate the web element.
pub async fn click_on_xpath(&self, xpath: &str) {
self.get_element_by_xpath(xpath)
.await
.unwrap()
.click()
.await
.unwrap();
}
/// Scrolls to a specific web element using the browser's `WebDriver`.
///
/// This asynchronous method is used to scroll to a specific web element by executing a JavaScript
/// script using the browser's `WebDriver`.
///
/// The method returns a `Result` containing either a `ScriptRet` indicating the script execution
/// result or a `WebDriverError` if an error occurs.
///
/// # Arguments
///
/// * `e` - A reference to the web element to scroll into view.
///
/// # Returns
///
/// A `Result` containing either a `ScriptRet` indicating the script execution result or a `WebDriverError`.
pub async fn scroll_to_element(&self, e: &WebElement) -> Result<ScriptRet, WebDriverError> {
self.driver
.as_ref()
.unwrap()
.execute("arguments[0].scrollIntoView();", vec![e.to_json()?])
.await
}
/// Retrieves a complete URL from a link element using the browser's `WebDriver`.
///
/// This asynchronous method is used to retrieve a complete URL from a web element
/// by extracting its "href" attribute and resolving it against the current page's URL.
///
/// # Arguments
///
/// * `el` - The web element from which to extract the URL.
///
/// # Returns
///
/// A string representing the complete URL derived from the link element.
pub async fn get_url_from_link(&self, el: WebElement) -> String {
let url = attr_from!(el, "href");
self.get_absolute_url(&url).await
}
/// Converts a URL to an absolute URL based on the current page's URL.
///
/// This function takes a relative or absolute `url` as input and returns the
/// corresponding absolute URL. If the input `url` is a relative URL, it is converted
/// to an absolute URL using the current page's URL as the base. If the input `url` is
/// already an absolute URL, it is returned as is.
///
/// # Parameters
///
/// - `url`: A string slice representing the relative or absolute URL to be converted.
///
/// # Returns
///
/// A `String` containing the absolute URL.
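///
/// For example, if the current page is `https://example.com/a/b?x=1`, a relative path of
/// `/img/cover.jpg` resolves to `https://example.com/img/cover.jpg`, while an already
/// absolute URL is returned unchanged.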
pub async fn get_absolute_url(&self, url: &str) -> String {
if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(url) {
let mut current_url = self.current_url().await.unwrap();
current_url.set_query(None);
current_url.set_path(url);
return current_url.to_string();
}
url.to_string()
}
/// Waits for a web element to be present using the browser's `WebDriver`.
///
/// This asynchronous method is used to wait for a web element to be present in the
/// DOM using its `XPath` expression. It takes an `XPath` string and a timeout duration
/// as arguments and performs the following actions:
///
/// 1. Queries for the web element using the provided `XPath` expression.
/// 2. Waits for the web element to exist within the specified timeout duration.
/// 3. Returns `true` if the web element is found within the timeout, otherwise `false`.
///
/// # Arguments
///
/// * `xpath` - The `XPath` expression used to locate the web element.
/// * `timeout` - The maximum duration to wait for the web element to appear.
///
/// # Returns
///
/// A boolean value indicating whether the web element was found within the timeout.
pub async fn wait_for(&self, xpath: &str, timeout: std::time::Duration) -> bool {
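// The second argument to `wait` is the poll interval; `Duration::new(0, 500)` is 500 nanoseconds, so the query is re-checked near-continuously until the element exists or the timeout expires.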
self.driver
.as_ref()
.unwrap()
.query(By::XPath(xpath))
.wait(timeout, std::time::Duration::new(0, 500))
.exists()
.await
.unwrap_or(false)
}
/// Retrieves the text content of a web element using its `XPath` expression.
///
/// This asynchronous method is used to locate a web element using its `XPath` expression
/// and retrieve its text content.
/// # Arguments
///
/// * `xpath` - The `XPath` expression used to locate the web element.
///
/// # Returns
///
/// An `Option` containing the text content of the web element, or `None` if not found.
pub async fn get_element_text_by_xpath(&self, xpath: &str) -> Option<String> {
self.find(By::XPath(xpath)).await.ok()?.text().await.ok()
}
/// Retrieves the value of a specific attribute from a web element using its `XPath` expression.
///
/// This asynchronous method is used to locate a web element using its `XPath` expression
/// and retrieve the value of a specific attribute.
/// # Arguments
///
/// * `xpath` - The `XPath` expression used to locate the web element.
/// * `attr` - The name of the attribute to retrieve.
///
/// # Returns
///
/// An `Option` containing the value of the specified attribute, or `None` if not found.
pub async fn get_element_attr_by_xpath(&self, xpath: &str, attr: &str) -> Option<String> {
self.find(By::XPath(xpath))
.await
.ok()?
.attr(attr)
.await
.ok()?
}
/// Retrieves a list of web elements using their `XPath` expression.
///
/// This asynchronous method is used to locate multiple web elements using a common `XPath` expression.
/// # Arguments
///
/// * `xpath` - The `XPath` expression used to locate the web elements.
///
/// # Returns
///
/// An `Option` containing a vector of located web elements, or `None` if none are found.
pub async fn get_elements_by_xpath(&self, xpath: &str) -> Option<Vec<WebElement>> {
self.find_all(By::XPath(xpath)).await.ok()
}
/// Retrieves a single web element using its `XPath` expression.
///
/// This asynchronous method is used to locate a single web element using its `XPath` expression.
/// # Arguments
///
/// * `xpath` - The `XPath` expression used to locate the web element.
///
/// # Returns
///
/// An `Option` containing the located web element, or `None` if not found.
pub async fn get_element_by_xpath(&self, xpath: &str) -> Option<WebElement> {
self.find(By::XPath(xpath)).await.ok()
}
/// Quits the browser instance and `WebDriver` process.
///
/// This asynchronous method is used to gracefully quit the browser instance and the associated `WebDriver` process.
pub async fn quit(mut self) {
let b = self.driver.take().unwrap();
b.quit().await.unwrap();
self.cmd.kill().unwrap();
}
}
impl Deref for Browser {
type Target = WebDriver;
fn deref(&self) -> &Self::Target {
self.driver.as_ref().unwrap()
}
}
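// Languages the extractors can scrape in; `EnumVariantNames` exposes the variant names as `Language::VARIANTS`, which the CLI uses as the possible values for `--lang`.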
#[allow(non_camel_case_types)]
#[derive(Debug, EnumVariantNames, Clone)]
pub enum Language {
en_US,
de_DE,
}
impl FromStr for Language {
type Err = ();
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"en_US" => Ok(Language::en_US),
"de_DE" => Ok(Language::de_DE),
_ => Err(()),
}
}
}
impl ToString for Language {
fn to_string(&self) -> String {
match self {
Language::en_US => "en_US".to_string(),
Language::de_DE => "de_DE".to_string(),
}
}
}
pub struct Config {
/// Save a timestamp alongside the scraped data
pub save_ts: bool,
/// Set the desired language for the extractor
pub language: Language,
/// Download media urls to disk
pub download_media: bool,
/// URL of the HTTP Proxy to use
pub http_proxy: Option<String>,
/// Embed media urls as data urls
pub embed_media: bool,
/// Force a specific extractor
pub force_extractor: Option<String>,
}
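// Illustrative construction (hypothetical values): `Config { language: Language::de_DE, embed_media: true, ..Config::default() }` scrapes German pages and embeds media as data URLs while leaving the other options at their defaults.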
impl Default for Config {
fn default() -> Self {
Self {
save_ts: false,
language: Language::en_US,
download_media: false,
http_proxy: None,
embed_media: false,
force_extractor: None,
}
}
}

128
src/main.rs Normal file
View file

@ -0,0 +1,128 @@
use clap::{App, Arg};
use scrape::{Config, Language};
use std::{io::Write, str::FromStr};
use strum::VariantNames;
#[must_use]
pub fn cli_args() -> clap::ArgMatches<'static> {
App::new("Web Scraper")
/* .arg(Arg::with_name("sites")
.long("sites")
.takes_value(false)
.multiple(false)
.help("Show all supported sites"))*/
.arg(
Arg::with_name("t")
.short("t")
.long("timestamp")
.help("Store timestamp when scraping"),
)
.arg(
Arg::with_name("d")
.short("d")
.long("download")
.help("Download any found media urls"),
)
.arg(
Arg::with_name("lang")
.long("lang")
.takes_value(true)
.required(false)
.possible_values(Language::VARIANTS)
.default_value("en_US")
.help("Desired language to scrape in"),
)
.arg(
Arg::with_name("http-proxy")
.long("http-proxy")
.takes_value(true)
.help("HTTP Proxy"),
)
.arg(
Arg::with_name("extractor")
.long("extractor")
.help("Force specific extractor")
.possible_values(
&scrape::extractors::get_extractors()
.iter()
.map(|x| x.name())
.collect::<Vec<String>>()
.iter()
.map(std::string::String::as_str)
.collect::<Vec<&str>>(),
)
.takes_value(true)
.required(false),
)
.arg(
Arg::with_name("e")
.short("e")
.long("embed-media")
.help("Embed media urls as data urls"),
)
.arg(
Arg::with_name("url")
.required(true)
.index(1)
.help("URL to scrape"),
)
.get_matches()
}
fn setup_logger() {
let mut logger = env_logger::builder();
#[cfg(debug_assertions)]
logger.filter_level(log::LevelFilter::Trace);
#[cfg(not(debug_assertions))]
logger.filter_level(log::LevelFilter::Info);
logger
.format(|buf, record| {
use log::Level;
let level = record.level();
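// Map each log level to an ANSI color escape code; the `\x1b[0m` in the format string resets the color after the level tag.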
let color = match level {
Level::Error => "\x1b[31m",
Level::Warn => "\x1b[33m",
Level::Info => "\x1b[32m",
Level::Debug => "\x1b[34m",
Level::Trace => "\x1b[35m",
};
writeln!(
buf,
"{}{}\x1b[0m [{}]: {}",
color,
record.metadata().level().to_string().to_uppercase(),
record.metadata().target(),
record.args()
)
})
.init();
}
#[tokio::main]
async fn main() {
setup_logger();
let matches = cli_args();
let http_proxy = matches.value_of("http-proxy");
let url = matches.value_of("url").unwrap();
let conf = Config {
save_ts: matches.is_present("t"),
language: Language::from_str(matches.value_of("lang").unwrap()).expect("unknown language"),
download_media: matches.is_present("d"),
http_proxy: http_proxy.map(std::borrow::ToOwned::to_owned),
embed_media: matches.is_present("e"),
force_extractor: matches
.value_of("extractor")
.map(std::string::ToString::to_string),
};
scrape::extractors::scrape_url(url, &conf).await;
}

395
src/util.rs Normal file
View file

@ -0,0 +1,395 @@
use std::{io::Write, process::Stdio};
use base64::Engine;
use regex::Regex;
use thirtyfour::WebElement;
use crate::{attr_from, text_from, Config};
/// Removes the last `n` characters from the given string `input` and returns a new `String`
/// containing the modified content.
///
/// This function takes a reference to a string `input` and an unsigned integer `n`. It then
/// creates a new string containing all characters of `input` except for the last `n` characters.
/// The resulting modified string is returned.
///
/// # Arguments
///
/// * `input` - The input string from which characters will be removed.
/// * `n` - The number of characters to remove from the end of the string.
///
/// # Returns
///
/// A new `String` containing the modified content with the last `n` characters removed.
///
/// # Examples
///
/// ```
/// use scrape::util::remove_last_n_chars;
///
/// let input = "example";
/// let modified = remove_last_n_chars(input, 3);
/// assert_eq!(modified, "exam");
/// ```
#[must_use]
pub fn remove_last_n_chars(input: &str, n: usize) -> String {
input[..input.len() - n].to_string()
}
/// Splits a slice into consecutive, non-overlapping windows of a specified size.
///
/// Given a slice `lst` and a window `size`, this function collects the elements into
/// groups of `size` elements, taken in order. Any trailing elements that do not fill
/// a complete window are discarded.
///
/// # Parameters
///
/// - `lst`: A slice of elements from which windows will be extracted.
/// - `size`: The size of each window.
///
/// # Returns
///
/// A vector of vectors, where each inner vector represents one extracted window
/// of elements from the input slice.
///
/// # Examples
///
/// ```rust
/// use scrape::util::window;
///
/// let input: Vec<&str> = vec!["r", "u", "s", "t"];
/// let out = window(&input, 2);
/// assert_eq!(out, vec![vec!["r", "u"], vec!["s", "t"]]);
///
/// let input: Vec<&str> = vec!["a", "b", "c", "d", "e", "f", "g", "h", "i"];
/// let out = window(&input, 3);
/// assert_eq!(out, vec![vec!["a", "b", "c"], vec!["d", "e", "f"], vec!["g", "h", "i"]]);
/// ```
pub fn window<T: Clone>(lst: &[T], size: usize) -> Vec<Vec<T>> {
let mut result = Vec::new();
let mut wd = lst.windows(size);
for i in 0..wd.len() {
let window = wd.next().unwrap();
if i % size != 0 {
continue;
}
result.push(window.to_owned());
}
result
}
/// Escapes unsafe characters in the given filename and returns a new `String` with
/// the unsafe characters replaced by underscores.
///
/// This function takes a reference to a filename string and scans it for characters that
/// are considered unsafe in filenames, such as `<`, `>`, `:`, `"`, `/`, `\`, `|`, `?`, `*`,
/// control characters (0x00-0x1F), and DEL (0x7F). It then replaces all occurrences of such
/// unsafe characters with underscores (`_`) in the filename and returns the modified string.
///
/// The function uses the `regex` crate to perform the replacement.
///
/// # Arguments
///
/// * `filename` - The filename string containing unsafe characters.
///
/// # Returns
///
/// A new `String` with unsafe characters replaced by underscores.
///
/// # Examples
///
/// ```
/// use scrape::util::escape_unsafe_characters;
///
/// let filename = "hello/world?.txt";
/// let escaped = escape_unsafe_characters(filename);
/// assert_eq!(escaped, "hello_world_.txt");
/// ```
#[must_use]
pub fn escape_unsafe_characters(filename: &str) -> String {
let unsafe_chars = r#"[<>:"/\\|?*\x00-\x1F\x7F]"#;
let re = Regex::new(unsafe_chars).unwrap();
re.replace_all(filename, "_").to_string()
}
/// Parses a date string using the provided format and returns the parsed date
/// in the "YYYY-MM-DD" format as a `String`.
///
/// This function takes a reference to a date string and a format string that specifies
/// the expected format of the input date. It attempts to parse the input date using the
/// given format and returns an `Option<String>` containing the parsed date in the
/// "YYYY-MM-DD" format if the parsing is successful. If parsing fails, `None` is returned.
///
/// The function uses the `chrono` crate to handle date parsing and formatting.
///
/// # Arguments
///
/// * `date` - The input date string to be parsed.
/// * `format` - The format string specifying the expected format of the input date.
///
/// # Returns
///
/// An `Option<String>` containing the parsed date in "YYYY-MM-DD" format if parsing is successful,
/// otherwise `None`.
///
/// # Examples
///
/// ```
/// use scrape::util::parse_date;
///
/// let date_str = "20-08-2023";
/// let format_str = "%d-%m-%Y";
/// let parsed = parse_date(date_str, format_str);
/// assert_eq!(parsed, Some(String::from("2023-08-20")));
/// ```
#[must_use]
pub fn parse_date(date: &str, format: &str) -> Option<String> {
let date = chrono::NaiveDate::parse_from_str(date, format).ok()?;
Some(date.format("%Y-%m-%d").to_string())
}
/// Handles a media URL based on the provided configuration, downloading, saving, and
/// potentially embedding the media content as a data URL.
///
/// This asynchronous function takes a reference to a URL, a file name, a boolean flag
/// indicating whether to use the raw file name, and a reference to a `Config` instance.
///
/// # Arguments
///
/// * `url` - The URL of the media content.
/// * `file_name` - The desired file name for saving the media content.
/// * `raw_file_name` - A flag indicating whether to use the raw file name.
/// * `conf` - A reference to a `Config` instance containing configuration settings.
///
/// # Returns
///
/// A `String` representing the Data URL or the original URL.
pub async fn handle_media_url(
url: &str,
file_name: &str,
raw_file_name: bool,
conf: &Config,
) -> String {
let file_name = escape_unsafe_characters(file_name);
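// The media is only fetched when it will be saved to disk or embedded as a data URL; in every other case the original URL is returned unchanged.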
if conf.download_media || conf.embed_media {
let data = download(url, conf).await;
if data.is_err() {
log::error!("Downloading '{url}' failed");
return url.to_string();
}
let data = data.unwrap();
if conf.download_media {
if raw_file_name {
save_raw(url, &data, &file_name);
} else {
save(url, &file_name, &data);
}
}
if conf.embed_media {
return to_data_url(&data);
}
}
url.to_string()
}
/// Escapes a given string by replacing spaces with underscores and converting it to lowercase.
///
/// This function takes a reference to a string `s` and performs the following actions:
///
/// 1. Replaces all occurrences of space characters (' ') with underscores ('_').
/// 2. Converts the entire string to lowercase.
///
/// The modified string is then returned.
///
/// # Arguments
///
/// * `s` - The input string to be escaped.
///
/// # Returns
///
/// A new `String` with spaces replaced by underscores and converted to lowercase.
///
/// # Examples
///
/// ```
/// use scrape::util::escape_key;
///
/// let original = "Hello World";
/// let escaped = escape_key(original);
/// assert_eq!(escaped, "hello_world");
/// ```
#[must_use]
pub fn escape_key(s: &str) -> String {
s.replace(' ', "_").to_lowercase()
}
/// Converts binary data into a data URL string.
///
/// This function takes a reference to a slice of bytes `data`, determines its MIME type and
/// base64-encodes the bytes to build a `data:` URL.
///
/// The `file` command is used to determine the MIME type of the data by reading from stdin.
///
/// # Arguments
///
/// * `data` - The binary data to be converted to a data URL.
///
/// # Returns
///
/// A `String` containing the data URL.
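///
/// # Note
///
/// Relies on the external `file` command being available on `PATH`.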
#[must_use]
pub fn to_data_url(data: &[u8]) -> String {
let mut file_cmd = std::process::Command::new("file")
.arg("--mime-type")
.arg("-")
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.spawn()
.unwrap();
{
let mut stdin = file_cmd.stdin.take().unwrap();
stdin.write_all(data).unwrap();
}
let out = file_cmd.wait_with_output().expect("file executable error");
let stdout = String::from_utf8_lossy(&out.stdout).to_string();
let mime_type = remove_last_n_chars(&stdout.replace("/dev/stdin: ", ""), 1);
let base64_data = base64::engine::general_purpose::STANDARD.encode(data);
format!("data:{mime_type};base64,{base64_data}")
}
/// Downloads content from the provided URL using the given configuration.
///
/// # Arguments
///
/// * `url` - The URL from which to download content.
/// * `conf` - A reference to a `Config` instance containing optional proxy settings.
///
/// # Returns
///
/// A `Result` containing either the downloaded content as a `Vec<u8>` or an error message.
pub async fn download(url: &str, conf: &crate::Config) -> Result<Vec<u8>, String> {
let mut client_builder = reqwest::Client::builder();
if conf.http_proxy.is_some() {
let proxy = reqwest::Proxy::http(conf.http_proxy.clone().unwrap())
.ok()
.ok_or("could not create proxy")?;
client_builder = client_builder.proxy(proxy);
}
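// Send a desktop Chrome user-agent string instead of reqwest's default.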
let user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36";
let client = client_builder
.user_agent(user_agent)
.build()
.ok()
.ok_or("could not create client")?;
let resp = client.get(url).send().await.ok().ok_or("request failed")?;
if resp.status() == reqwest::StatusCode::OK {
let data = resp
.bytes()
.await
.ok()
.ok_or("could not get response body")?;
return Ok(data.to_vec());
}
Err(format!("Request failed with Status {}", resp.status()))
}
fn save_raw(url: &str, data: &[u8], file_name: &str) {
match std::fs::write(file_name, data) {
Ok(()) => {
log::info!("Saved '{url}' to '{file_name}'");
}
Err(e) => {
log::error!("Error saving '{url}': {e:?}");
}
}
}
fn save(url: &str, file_name: &str, data: &[u8]) {
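// Append the URL's final path segment to the chosen name so the saved file keeps the original file name and extension (e.g. `cover` + a URL ending in `/poster.jpg` -> `cover.poster.jpg`).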
let p_url = url::Url::parse(url).unwrap();
let path_segments: Vec<_> = p_url.path_segments().unwrap().collect();
let file_ending = (*path_segments.last().unwrap_or(&"")).to_string();
let file_name = format!("{file_name}.{file_ending}");
save_raw(url, data, &file_name);
}
/// Extracts text content from a collection of web elements asynchronously.
///
/// # Arguments
///
/// * `v` - A `Vec<WebElement>` from which to extract text content.
///
/// # Returns
///
/// A `Vec<String>` containing the extracted text content from the `WebElement`.
pub async fn extract_texts_from_elements(v: Vec<WebElement>) -> Vec<String> {
let mut ret: Vec<_> = vec![];
for e in v {
ret.push(text_from!(e));
}
ret
}
/// Extracts an attribute from a collection of web elements asynchronously.
///
/// # Arguments
///
/// * `v` - A `Vec<WebElement>` from which to extract attribute.
///
/// # Returns
///
/// A `Vec<String>` containing the extracted attribute from the `WebElement`.
pub async fn extract_attrs_from_elements(v: Vec<WebElement>, attr: &str) -> Vec<String> {
let mut ret: Vec<_> = vec![];
for e in v {
ret.push(attr_from!(e, attr));
}
ret
}
/// Parses a string containing a currency value and symbol into a JSON map.
///
/// This function takes a reference to a string `v` representing a currency value along
/// with its symbol (e.g., "$123.45").
///
/// # Arguments
///
/// * `v` - The input string containing a currency value and symbol.
///
/// # Returns
///
/// A JSON map with keys "currency" and "value" representing the currency symbol and value, respectively.
///
/// # Examples
///
/// ```rust
/// use scrape::util::currency;
///
/// let currency_str = "123.45$";
/// let json_map = currency(currency_str);
/// assert_eq!(json_map.get("currency").unwrap().as_str().unwrap(), "$");
/// assert_eq!(json_map.get("value").unwrap().as_f64().unwrap(), 123.45);
/// ```
#[must_use]
pub fn currency(v: &str) -> serde_json::Map<String, serde_json::Value> {
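// Capture group 1 is the numeric part (digits, commas and dots), group 2 the trailing currency symbol; a comma decimal separator is normalised to a dot before parsing.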
let re = Regex::new(r"^([\d,.]+)([^\d,.]+)$").unwrap();
let captures = re.captures(v).unwrap();
let value_str = captures.get(1).unwrap().as_str().replace(',', ".");
let value = value_str.parse::<f64>().unwrap();
let currency_symbol = captures.get(2).unwrap().as_str().to_string();
let mut result: serde_json::Map<String, serde_json::Value> = serde_json::Map::new();
result.insert("currency".to_string(), currency_symbol.into());
result.insert("value".to_string(), value.into());
result
}