init
commit 72a8357548
2 .gitignore vendored Normal file
@@ -0,0 +1,2 @@
/target
.vscode
1835 Cargo.lock generated Normal file
File diff suppressed because it is too large
23 Cargo.toml Normal file
@@ -0,0 +1,23 @@
[package]
name = "scrape"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
thirtyfour = "0.31.0"
tokio = { version = "1.32.0", features = ["full"] }
clap = "2.33"
strum = { version = "0.21", features = ["derive"] }
regex = "1.9.3"
chrono = "0.4.26"
url = "2.4.0"
serde = "1.0.183"
serde_json = "1.0.105"
async-trait = "0.1.73"
log = "0.4"
env_logger = "0.10"
base64 = "0.21.2"
reqwest = { version = "0.11.18" }
urlencoding = "2.1.3"
3 README.md Normal file
@@ -0,0 +1,3 @@
# scrape

Scrape is a tool to scrape websites and turn their data into JSON. Like yt-dlp, but for web scraping.
59 docs/add-new-extractor.md Normal file
@@ -0,0 +1,59 @@
# Adding a new Extractor

## Create a New Extractor Source File

Create a new Rust source file inside `src/extractors`.

`src/extractors/myext.rs`:

```rust
use super::prelude::*;

pub struct MySiteExtractor;

impl MySiteExtractor {
    pub fn new() -> Self { Self {} }
}
```

## Implement the Extractor Trait

Implement the `Extractor` trait by providing the required methods: `supported_hosts`, `name`, and `run_scrape`.

```rust
#[async_trait]
impl Extractor for MySiteExtractor {
    fn supported_hosts(&self) -> Vec<&str> {
        vec!["my-site.com"]
    }

    fn name(&self) -> String {
        "My Site Extractor".to_string()
    }

    async fn run_scrape(
        &self,
        url: Url,
        browser: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        // scraping logic
        unimplemented!()
    }
}
```

## Register the Extractor

In `src/extractors/mod.rs` add the following:

```rust
pub mod myext;

[...]

#[must_use]
pub fn get_extractors() -> Vec<Box<dyn Extractor>> {
    vec![
        ...
        // Add your new extractor
        Box::new(myext::MySiteExtractor::new()),
        ...
    ]
}
```
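
Once registered, the extractor is picked up automatically: `scrape_url` walks the list returned by `get_extractors` and uses the first entry whose `supported_hosts` contains the URL's host (see `match_url` in `src/extractors/mod.rs`).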
123 src/extractors/amazon.rs Normal file
@@ -0,0 +1,123 @@
use crate::{
    util::{currency, escape_key},
    Language,
};

use super::prelude::*;

pub struct AmazonExtractor {}

impl AmazonExtractor {
    pub fn new() -> Self {
        Self {}
    }

    pub async fn amazon_product(
        &self,
        url: Url,
        b: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        let mut url = url;
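        // Force the page language via a query parameter so element text is
        // rendered in a predictable locale.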
        match conf.language {
            Language::en_US => {
                url.query_pairs_mut().append_pair("language", "en_GB");
                b.goto(url.to_string()).await.unwrap();
            }
            _ => {
                url.query_pairs_mut()
                    .append_pair("language", &conf.language.to_string());
                b.goto(url.to_string()).await.unwrap();
            }
        }
        log::info!(
            "Changing to '{url}' because of {:?} language",
            conf.language
        );

        let mut info: HashMap<String, Value> = HashMap::new();

        b.click_on_xpath(r#"//*[@id="sp-cc-accept"]"#).await;

        info.insert(
            "product_title".into(),
            b.get_element_text_by_xpath(r#"//*[@id="productTitle"]"#)
                .await
                .unwrap()
                .into(),
        );

        info.insert(
            "star_rating".into(),
            b.get_element_text_by_xpath(
                r#"//*[@class="reviewCountTextLinkedHistogram noUnderline"]/span[1]/a/span"#,
            )
            .await
            .unwrap()
            .replace(',', ".")
            .parse::<f64>()
            .unwrap()
            .into(),
        );
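
        // The price is split across separate symbol, whole, and fraction
        // elements; reassemble them before handing the string to currency().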
        let price_symbol = b
            .get_element_text_by_xpath(
                r#"//*[@id="corePrice_feature_div"]//span[@class="a-price-symbol"]"#,
            )
            .await
            .unwrap();
        let whole = b
            .get_element_text_by_xpath(
                r#"//*[@id="corePrice_feature_div"]//span[@class="a-price-whole"]"#,
            )
            .await
            .unwrap()
            .replace(['.', ','], "");
        let fraction = b
            .get_element_text_by_xpath(
                r#"//*[@id="corePrice_feature_div"]//span[@class="a-price-fraction"]"#,
            )
            .await
            .unwrap();

        info.insert(
            "price".into(),
            currency(&format!("{whole}.{fraction}{price_symbol}")).into(),
        );

        let mut tech_details: Map<String, Value> = Map::new();
        let tech_details_html = b
            .get_element_by_xpath(r#"//*[@id="productDetails_techSpec_section_1"]/tbody"#)
            .await
            .unwrap();
        for detail in tech_details_html.find_all(By::Tag("tr")).await.unwrap() {
            let key = text_from!(detail.find(By::Tag("th")).await.unwrap());
            let value = text_from!(detail.find(By::Tag("td")).await.unwrap());
            tech_details.insert(escape_key(&key), value.into());
        }

        info.insert("technical_details".into(), tech_details.into());

        Ok(info)
    }
}

#[async_trait]
impl Extractor for AmazonExtractor {
    fn name(&self) -> String {
        "AMAZON".to_owned()
    }

    fn supported_hosts(&self) -> Vec<&str> {
        vec!["www.amazon.de"]
    }

    async fn run_scrape(
        &self,
        url: Url,
        browser: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        self.amazon_product(url, browser, conf).await
    }
}
343 src/extractors/anilist.rs Normal file
@@ -0,0 +1,343 @@
use regex::Regex;

use crate::util::{escape_key, extract_texts_from_elements, handle_media_url};

use super::prelude::*;

pub struct AnilistExtractor {}

impl AnilistExtractor {
    pub fn new() -> Self {
        Self {}
    }

    async fn anime(
        &self,
        url: Url,
        b: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        b.goto(url.to_string()).await.unwrap();

        let mut info: HashMap<String, Value> = HashMap::new();
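
        // Dismiss the cookie-consent dialog if it shows up before scraping.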
        if b.wait_for(r#"//span[text()="AGREE"]"#, Duration::from_secs(5))
            .await
        {
            b.click_on_xpath(r#"//span[text()="AGREE"]"#).await;
        }

        if b.wait_for(
            r#"//div[@class="header"]//div[@class="content"]/h1"#,
            Duration::from_secs(5),
        )
        .await
        {
            info.insert(
                "title".into(),
                b.get_element_text_by_xpath(r#"//div[@class="header"]//div[@class="content"]/h1"#)
                    .await
                    .unwrap()
                    .into(),
            );

            info.insert(
                "cover".into(),
                handle_media_url(
                    &b.get_element_attr_by_xpath(
                        r#"//div[@class="header"]//div[@class="cover-wrap-inner"]/img"#,
                        "src",
                    )
                    .await
                    .unwrap(),
                    "cover",
                    false,
                    conf,
                )
                .await
                .into(),
            );

            if let Some(desc_show_more) = b
                .get_element_by_xpath(r#"//span[@class="description-length-toggle"]"#)
                .await
            {
                b.scroll_to_element(&desc_show_more).await.unwrap();
                desc_show_more.click().await.unwrap();
            }

            info.insert(
                "description".into(),
                b.get_element_text_by_xpath(r#"//div[@class="header"]//p[@class="description"]"#)
                    .await
                    .unwrap()
                    .into(),
            );

            let data_html = b
                .get_element_by_xpath(r#"//div[@class="data"]"#)
                .await
                .unwrap();

            for data_field in data_html.find_all(By::XPath("./div")).await.unwrap() {
                let key = escape_key(&text_from!(data_field
                    .find(By::XPath(r#"./div[@class="type"]"#))
                    .await
                    .unwrap()));
                let val = data_field
                    .find(By::XPath(r#"./*[@class="value"]"#))
                    .await
                    .unwrap();
                let val_list = val.find_all(By::XPath("./span")).await.unwrap();
                let val_list = extract_texts_from_elements(val_list).await;
                // TODO : parse data fields
                if val_list.is_empty() {
                    info.insert(key, text_from!(val).into());
                } else {
                    info.insert(key, val_list.into());
                }
            }

            let mut tags: Vec<String> = vec![];
            let tags_html = b
                .get_element_by_xpath(r#"//div[@class="tags"]"#)
                .await
                .unwrap();
            for tag_html in tags_html
                .find_all(By::XPath(r#"./div[@class="tag"]"#))
                .await
                .unwrap()
            {
                tags.push(text_from!(tag_html
                    .find(By::XPath(r#"./a[1]"#))
                    .await
                    .unwrap()));
            }
            info.insert("tags".into(), tags.into());

            let mut websites: Map<String, Value> = Map::new();
            let websites_html = b
                .get_element_by_xpath(r#"//div[@class="external-links"]/div"#)
                .await
                .unwrap();
            for website in websites_html.find_all(By::XPath("./a")).await.unwrap() {
                let web_link = attr_from!(website, "href");
                let mut web_name = text_from!(website
                    .find(By::XPath(r#"./span[@class="name"]"#))
                    .await
                    .unwrap());
                let language = match website
                    .find(By::XPath(r#".//span[@class="language"]"#))
                    .await
                {
                    Ok(el) => Some(text_from!(el)),
                    Err(_) => None,
                };
                if language.is_some() {
                    web_name = web_name.replace(&format!(" {}", language.clone().unwrap()), "");
                }
                websites.insert(
                    web_name,
                    json!({
                        "url": web_link,
                        "language": language
                    }),
                );
            }
            info.insert("websites".into(), websites.into());

            let mut relations: Vec<Value> = vec![];
            let relations_html = b
                .get_element_by_xpath(r#"//div[@class="relations"]"#)
                .await
                .unwrap();
            for relation_html in relations_html
                .find_all(By::XPath(r#"./div/div"#))
                .await
                .unwrap()
            {
                let relation_info = text_from!(relation_html
                    .find(By::XPath(r#"./div[@class="content"]/div[@class="info"]"#))
                    .await
                    .unwrap());
                let (rel_type, rel_status) = relation_info.split_once(" · ").unwrap();
                let relation = json!({
                    "url": b.get_url_from_link(relation_html.find(By::XPath("./a")).await.unwrap()).await,
                    "kind": text_from!(relation_html.find(By::XPath(r#"./div[@class="content"]/div[@class="info-header"]/div"#)).await.unwrap()),
                    "title": text_from!(relation_html.find(By::XPath(r#"./div[@class="content"]/a[@class="title"]"#)).await.unwrap()),
                    "type": rel_type,
                    "status": rel_status
                });
                relations.push(relation);
            }
            info.insert("relations".into(), relations.into());

            let recommendations_html = b
                .get_element_by_xpath(r#"//div[@class="recommendations"]"#)
                .await
                .unwrap();
            let show_all_button = recommendations_html
                .find(By::XPath(r#".//div[@class="view-all"]/div"#))
                .await
                .unwrap();
            b.scroll_to_element(&show_all_button).await.unwrap();
            show_all_button.click().await.unwrap();

            let mut recommendations: Map<String, Value> = Map::new();
            for rec_html in recommendations_html
                .find_all(By::XPath(r#"./div/div[@class="recommendation-card"]/a"#))
                .await
                .unwrap()
            {
                let rec_url = b.get_url_from_link(rec_html.clone()).await;
                let rec_title = text_from!(rec_html);
                recommendations.insert(rec_title, rec_url.into());
            }
            info.insert("recommendations".into(), recommendations.into());

            let mut nav_urls: Vec<String> = vec![];
            for nav in b
                .get_elements_by_xpath(r#"//div[@class="nav"]/a"#)
                .await
                .unwrap()
            {
                nav_urls.push(attr_from!(nav, "href"));
            }

            for nav_url in nav_urls {
                if nav_url.ends_with("characters") {
                    info.insert(
                        "characters".into(),
                        self.characters_page(&nav_url, b, conf).await?.into(),
                    );
                }
                if nav_url.ends_with("staff") {
                    info.insert(
                        "staff".into(),
                        self.staff_page(&nav_url, b, conf).await?.into(),
                    );
                }
                if nav_url.ends_with("stats") {
                    info.insert(
                        "stats".into(),
                        self.stats_page(&nav_url, b, conf).await?.into(),
                    );
                }
                // todo : reviews?
            }
        } else {
            Err("Scrape failed")?;
        }

        Ok(info)
    }

    async fn characters_page(
        &self,
        _url: &str,
        _b: &mut crate::Browser,
        _conf: &Config,
    ) -> Result<Map<String, Value>, String> {
        // todo : character page
        Ok(Map::new())
    }
    async fn staff_page(
        &self,
        _url: &str,
        _b: &mut crate::Browser,
        _conf: &Config,
    ) -> Result<Map<String, Value>, String> {
        // todo : staff page
        Ok(Map::new())
    }
    async fn stats_page(
        &self,
        url: &str,
        b: &mut crate::Browser,
        _conf: &Config,
    ) -> Result<Map<String, Value>, String> {
        let mut data = Map::new();

        b.goto(url).await.unwrap();

        if b.wait_for(r#"//div[@class="rankings graph"]"#, Duration::from_secs(5))
            .await
        {
            let mut ranking = Map::new();
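
            // Rank badges read like "#12 Most Popular All Time"; pull the
            // number out and keep the remaining text as the key.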
            for rank in b.get_elements_by_xpath(r#"//div[@class="rankings graph"]/a[@class="ranking popular"]/span[@class="rank-text"]"#).await.unwrap() {
                let re = Regex::new(r"#(\d+)").unwrap();

                if let Some(matched) = re.find(&rank.text().await.unwrap()) {
                    if let Ok(rank_num) = matched.as_str()[1..].parse::<i32>() {
                        let rank_key = text_from!(rank).replacen(matched.as_str(), "", 1).trim().to_string();
                        ranking.insert(escape_key(&rank_key), rank_num.into());
                    }
                }
            }

            for rank in b.get_elements_by_xpath(r#"//div[@class="rankings graph"]/a[@class="ranking rated"]/span[@class="rank-text"]"#).await.unwrap() {
                let re = Regex::new(r"#(\d+)").unwrap();

                if let Some(matched) = re.find(&rank.text().await.unwrap()) {
                    if let Ok(rank_num) = matched.as_str()[1..].parse::<i32>() {
                        let rank_key = text_from!(rank).replacen(matched.as_str(), "", 1).trim().to_string();
                        ranking.insert(escape_key(&rank_key), rank_num.into());
                    }
                }
            }

            data.insert("ranking".into(), ranking.into());

            let mut viewer_status: Map<String, Value> = Map::new();
            if b.wait_for(r#"//div[@class="status-distribution content-wrap"]/div[@class="statuses"]/div[@class="status"]"#, Duration::from_secs(5)).await {
                b.scroll_to_element(&b.get_element_by_xpath(r#"//div[@class="status-distribution content-wrap"]/div[@class="statuses"]/div[@class="status"]"#).await.unwrap()).await.unwrap();
            }
            let viewer_status_dist = b.get_elements_by_xpath(r#"//div[@class="status-distribution content-wrap"]/div[@class="statuses"]/div[@class="status"]"#).await.unwrap();
            for status in viewer_status_dist {
                b.scroll_to_element(&status).await.unwrap();
                let key = text_from!(status
                    .find(By::XPath(r#"./div[@class="name"]"#))
                    .await
                    .unwrap());
                if !key.is_empty() {
                    let val: isize = text_from!(status
                        .find(By::XPath(r#"./div[@class="amount"]"#))
                        .await
                        .unwrap())
                    .replace(" Users", "")
                    .parse()
                    .unwrap();
                    viewer_status.insert(escape_key(&key), val.into());
                }
            }
            data.insert("viewer_status".into(), viewer_status.into());

            // TODO : not complete
        } else {
            Err("Scrape failed")?;
        }

        Ok(data)
    }
}

#[async_trait]
impl Extractor for AnilistExtractor {
    fn name(&self) -> String {
        "ANILIST".to_owned()
    }

    fn supported_hosts(&self) -> Vec<&str> {
        vec!["anilist.co"]
    }

    async fn run_scrape(
        &self,
        url: Url,
        browser: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        self.anime(url, browser, conf).await
    }
}
289 src/extractors/anisearch.rs Normal file
@@ -0,0 +1,289 @@
use thirtyfour::WebElement;

use crate::util::{escape_key, handle_media_url, remove_last_n_chars};

use super::prelude::*;
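
/// Split an element's text into its bold header label (without the trailing
/// colon) and the remaining value, e.g. "Type: TV-Series" becomes
/// ("Type", "TV-Series").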
async fn split_header(el: &WebElement) -> (String, String) {
    let header = text_from!(el
        .find(By::XPath(r#".//span[@class="header"]"#))
        .await
        .unwrap());
    let value = text_from!(el);
    let value = value.replacen(&header, "", 1).trim().to_owned();
    (header.replace(':', ""), value)
}

pub struct AnisearchExtractor {}

impl AnisearchExtractor {
    pub fn new() -> Self {
        Self {}
    }

    async fn anime_search(
        &self,
        url: Url,
        b: &mut crate::Browser,
        _conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        b.goto(url.to_string()).await.unwrap();
        let mut info: HashMap<String, Value> = HashMap::new();

        info.insert(
            "query".into(),
            remove_last_n_chars(
                &b.get_element_text_by_xpath(r#"//*[@id="item-key-a-text"]"#)
                    .await
                    .unwrap()
                    .replace("Title starts with \"", ""),
                1,
            )
            .into(),
        );

        let mut results: Vec<String> = vec![];
        for result in b
            .get_elements_by_xpath(r#"//ul[@class="covers"]/li"#)
            .await
            .unwrap()
        {
            let link = b
                .get_url_from_link(result.find(By::XPath(".//a")).await.unwrap())
                .await;
            results.push(link);
        }

        info.insert("results".into(), results.into());

        Ok(info)
    }

    async fn anime(
        &self,
        url: Url,
        b: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        let mut info: HashMap<String, Value> = HashMap::new();

        b.goto(url.to_string()).await.unwrap();
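
        // The cookie-consent dialog is rendered inside a shadow root, so it
        // has to be located and clicked through JavaScript instead of XPath.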
        if b.wait_for(
            r#"//div[@class="needsclick cmp-root-container"]"#,
            Duration::from_secs(3),
        )
        .await
        {
            let script = r#"
                return document.querySelector("\#top > div.needsclick.cmp-root-container").shadowRoot.querySelector("\#consentDialog > div.cmp_ui.cmp_ext_text.cmp_state-stacks > div.cmp_navi > div > div.cmp_mainButtons > div > div.cmp_primaryButtonLine > div > div")
            "#;
            let accept_cookies = b.execute(script, ([]).to_vec()).await.unwrap();
            accept_cookies.element().unwrap().click().await.unwrap();
        }

        let anime_info_section = b
            .get_element_by_xpath(r#"//section[@id="information"]"#)
            .await
            .unwrap();

        let title_element = anime_info_section
            .find(By::XPath(r#".//div[@class="title"]//strong[@class="f16"]"#))
            .await
            .unwrap();
        info.insert(
            "original_title".into(),
            text_from!(anime_info_section
                .find(By::XPath(r#".//div[@class="title"]//div"#))
                .await
                .unwrap())
            .into(),
        );
        info.insert("title".into(), text_from!(title_element).into());

        let cover_image_url = attr_from!(
            anime_info_section
                .find(By::XPath(r#".//figure[@id="cover-container"]/img"#))
                .await
                .unwrap(),
            "src"
        );
        info.insert(
            "cover".into(),
            handle_media_url(&cover_image_url, "cover", false, conf)
                .await
                .into(),
        );

        let mut details: Map<String, Value> = Map::new();
        let details_elements = anime_info_section
            .find_all(By::XPath(r#"./div/ul/li[2]/ul/li[1]/div"#))
            .await
            .unwrap();
        for element in details_elements {
            if attr_from!(element, "class") == "title" {
                continue;
            }
            let (key, val) = split_header(&element).await;
            if attr_from!(element, "class") == "creators" {
                details.insert(
                    escape_key(&key),
                    val.split(", ").collect::<Vec<&str>>().into(),
                );
                continue;
            }
            if attr_from!(element, "class") == "websites" {
                let mut links: Vec<String> = vec![];
                let links_html = element.find_all(By::XPath("./a")).await.unwrap();
                for l in links_html {
                    links.push(attr_from!(l, "href"));
                }
                details.insert(escape_key(&key), links.into());
                continue;
            }
            details.insert(escape_key(&key), val.into());
        }
        info.insert("details".into(), details.into());

        for desc in b
            .get_elements_by_xpath(r#"//section[@id="description"]//button"#)
            .await
            .unwrap()
        {
            let desc_lang = attr_from!(desc, "lang");
            if try_attr_from!(desc, "class").unwrap_or(String::new()) != "active" {
                let show_more_button = b
                    .get_element_by_xpath(&format!(
                        r#"//section[@id="description"]//button[@lang="{desc_lang}"]"#
                    ))
                    .await
                    .unwrap();
                b.scroll_to_element(&show_more_button).await.unwrap();
                show_more_button.click().await.unwrap();
            }
        }

        let mut descriptions: Map<String, Value> = Map::new();
        for desc in b
            .get_elements_by_xpath(
                r#"//section[@id="description"]//div[@class="textblock details-text"]"#,
            )
            .await
            .unwrap()
        {
            let desc_lang = attr_from!(desc, "lang");
            let desc_text = text_from!(desc);
            descriptions.insert(desc_lang, desc_text.into());
        }
        info.insert("description".into(), descriptions.into());

        let tag_cloud = b
            .get_element_by_xpath(r#"//*[@id="description"]//ul[@class="cloud"]"#)
            .await
            .unwrap();
        let mut genres = json!({"main": [], "sub": []}).as_object().unwrap().clone();
        let mut tags: Vec<String> = vec![];
        for tag in tag_cloud.find_all(By::XPath("./li/a")).await.unwrap() {
            if attr_from!(tag, "class") == "gg showpop" {
                genres
                    .get_mut("main")
                    .unwrap()
                    .as_array_mut()
                    .unwrap()
                    .push(text_from!(tag).into());
            }
            if attr_from!(tag, "class") == "gc showpop" && !text_from!(tag).is_empty() {
                genres
                    .get_mut("sub")
                    .unwrap()
                    .as_array_mut()
                    .unwrap()
                    .push(text_from!(tag).into());
            }
            if attr_from!(tag, "class") == "gt showpop" {
                tags.push(text_from!(tag));
            }
        }
        info.insert("genres".into(), genres.into());
        info.insert("tags".into(), tags.into());

        if let Some(show_more_button) = b
            .get_element_by_xpath(r#"//*[@id="information"]/div/ul/li[2]/div/button"#)
            .await
        {
            b.scroll_to_element(&show_more_button).await.unwrap();
            show_more_button.click().await.unwrap();
        }

        let lang_html = b
            .get_elements_by_xpath(r#"//*[@id="information"]/div/ul/li[2]/ul/li"#)
            .await
            .unwrap();
        let mut dubs: Map<String, Value> = Map::new();

        if let Some(ol) = b.get_element_by_xpath(r#"//div[@class="title"]"#).await {
            dubs.insert(attr_from!(ol, "lang"), Value::Object(Map::new()));
        }
        let mut subs: Map<String, Value> = Map::new();

        for dub in lang_html {
            let lang_info = dub.find_all(By::XPath("./div")).await.unwrap();
            if lang_info.len() != 4 {
                continue;
            }

            let lang_lang = attr_from!(lang_info[0], "lang");
            let mut is_dub = false;
            if lang_info[0]
                .find(By::XPath(r#".//span[@class="speaker"]"#))
                .await
                .is_ok()
            {
                is_dub = true;
            }

            let lang_status = split_header(&lang_info[1]).await;
            let lang_release = split_header(&lang_info[2]).await;
            let lang_publisher = split_header(&lang_info[3]).await;

            let lang_map = json!({
                escape_key(&lang_status.0): lang_status.1,
                escape_key(&lang_release.0): lang_release.1,
                escape_key(&lang_publisher.0): lang_publisher.1
            });
            if is_dub {
                dubs.insert(lang_lang, lang_map);
            } else {
                subs.insert(lang_lang, lang_map);
            }
        }
        info.insert("dubs".into(), dubs.into());
        info.insert("subs".into(), subs.into());

        Ok(info)
    }
}

#[async_trait]
impl Extractor for AnisearchExtractor {
    fn name(&self) -> String {
        "ANISEARCH".to_owned()
    }

    fn supported_hosts(&self) -> Vec<&str> {
        vec!["www.anisearch.com"]
    }

    async fn run_scrape(
        &self,
        url: Url,
        browser: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        if url.path().starts_with("/anime/index") {
            self.anime_search(url, browser, conf).await
        } else {
            self.anime(url, browser, conf).await
        }
    }
}
228 src/extractors/aur.rs Normal file
@@ -0,0 +1,228 @@
use super::prelude::*;

pub struct AURExtractor {}

impl AURExtractor {
    pub fn new() -> Self {
        Self {}
    }

    pub async fn aur_package(
        &self,
        url: Url,
        b: &mut crate::Browser,
        _conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        b.goto(url.to_string()).await.unwrap();

        let head = text_from!(b
            .get_element_by_xpath(r#"//*[@id="pkgdetails"]/h2"#)
            .await
            .unwrap())[17..]
            .to_owned();
        let (name, version) = head.split_once(' ').unwrap();

        let pkg_info = b
            .get_element_by_xpath(r#"//*[@id="pkginfo"]"#)
            .await
            .unwrap();

        let mut info: HashMap<String, Value> = {
            let mut hm = HashMap::new();
            hm.insert("name".into(), name.into());
            hm.insert("version".into(), version.into());
            hm
        };

        for row in pkg_info.find_all(By::Tag("tr")).await.unwrap() {
            match text_from!(row.find(By::Tag("th")).await.unwrap()).as_str() {
                "Git Clone URL:" => {
                    info.insert(
                        "clone".into(),
                        attr_from!(row.find(By::Tag("a")).await.unwrap(), "href").into(),
                    );
                }
                "Description:" => {
                    info.insert(
                        "description".into(),
                        text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
                    );
                }
                "Upstream URL:" => {
                    info.insert(
                        "upstream".into(),
                        attr_from!(row.find(By::Tag("a")).await.unwrap(), "href").into(),
                    );
                }
                "Keywords:" => {
                    let keyword_items = row.find_all(By::Tag("a")).await.unwrap();
                    let mut keywords: Vec<String> = vec![];
                    for kw in keyword_items {
                        keywords.push(text_from!(kw));
                    }
                    info.insert("keywords".into(), keywords.into());
                }
                "Licenses:" => {
                    info.insert(
                        "license".into(),
                        text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
                    );
                }
                "Submitter:" => {
                    info.insert(
                        "submitter".into(),
                        text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
                    );
                }
                "Maintainer:" => {
                    info.insert(
                        "maintainer".into(),
                        text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
                    );
                }
                "Last Packager:" => {
                    info.insert(
                        "last_packager".into(),
                        text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
                    );
                }
                "Votes:" => {
                    info.insert(
                        "votes".into(),
                        text_from!(row.find(By::Tag("td")).await.unwrap())
                            .parse::<usize>()
                            .unwrap()
                            .into(),
                    );
                }
                "Popularity:" => {
                    info.insert(
                        "popularity".into(),
                        text_from!(row.find(By::Tag("td")).await.unwrap())
                            .parse::<f64>()
                            .unwrap()
                            .into(),
                    );
                }
                "First Submitted:" => {
                    info.insert(
                        "first_submitted".into(),
                        text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
                    );
                }
                "Last Updated:" => {
                    info.insert(
                        "last_updated".into(),
                        text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
                    );
                }
                _ => {
                    log::debug!("unknown column");
                }
            }
        }

        let mut dependencies: Vec<Value> = vec![];
        let dependency_items = b
            .get_element_by_xpath(r#"//*[@id="pkgdepslist"]"#)
            .await
            .unwrap();
        let mut deps = dependency_items.find_all(By::Tag("li")).await.unwrap();
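        // The dependency list can be truncated behind a "Show more" link;
        // follow it so the full list is scraped.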
        if !deps.is_empty() {
            if text_from!(deps.last().unwrap()).contains("Show ") {
                b.goto(attr_from!(
                    deps.last().unwrap().find(By::XPath("./a")).await.unwrap(),
                    "href"
                ))
                .await
                .unwrap();
                let dependency_items = b
                    .get_element_by_xpath(r#"//*[@id="pkgdepslist"]"#)
                    .await
                    .unwrap();
                deps = dependency_items.find_all(By::Tag("li")).await.unwrap();
            }

            for dep in deps {
                let dep_name = text_from!(dep.find(By::Tag("a")).await.unwrap());
                let dep_info =
                    text_from!(dep.find_all(By::Tag("em")).await.unwrap().last().unwrap());
                dependencies.push(json!({
                    "name": dep_name,
                    "info": dep_info
                }));
            }
        }
        info.insert("dependencies".into(), dependencies.into());

        let mut required_by: Vec<Value> = vec![];
        let required_by_items = b
            .get_element_by_xpath(r#"//*[@id="pkgreqslist"]"#)
            .await
            .unwrap();
        let mut reqs = required_by_items.find_all(By::Tag("li")).await.unwrap();
        if !reqs.is_empty() {
            if text_from!(reqs.last().unwrap()).contains("Show ") {
                b.goto(attr_from!(
                    reqs.last().unwrap().find(By::XPath("./a")).await.unwrap(),
                    "href"
                ))
                .await
                .unwrap();
                let required_by_items = b
                    .get_element_by_xpath(r#"//*[@id="pkgreqslist"]"#)
                    .await
                    .unwrap();
                reqs = required_by_items.find_all(By::Tag("li")).await.unwrap();
            }

            for req in reqs {
                let req_name = text_from!(req.find(By::Tag("a")).await.unwrap());
                let req_info =
                    text_from!(req.find_all(By::Tag("em")).await.unwrap().last().unwrap());
                required_by.push(json!({
                    "name": req_name,
                    "optional": (req_info == "(optional)")
                }));
            }
        }

        info.insert("required_by".into(), required_by.into());

        let mut sources: Vec<String> = vec![];
        for source in b
            .get_element_by_xpath(r#"//*[@id="pkgsrcslist"]"#)
            .await
            .unwrap()
            .find_all(By::Tag("li"))
            .await
            .unwrap()
        {
            sources.push(attr_from!(source.find(By::Tag("a")).await.unwrap(), "href"));
        }

        info.insert("sources".into(), sources.into());

        Ok(info)
    }
}

#[async_trait]
impl Extractor for AURExtractor {
    fn name(&self) -> String {
        "AUR".to_owned()
    }

    fn supported_hosts(&self) -> Vec<&str> {
        vec!["aur.archlinux.org"]
    }

    async fn run_scrape(
        &self,
        url: Url,
        browser: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        self.aur_package(url, browser, conf).await
    }
}
421 src/extractors/igdb.rs Normal file
@@ -0,0 +1,421 @@
use super::prelude::*;
use crate::util::{escape_key, handle_media_url, parse_date, remove_last_n_chars};

pub struct IGDBExtractor {}
impl IGDBExtractor {
    pub const fn new() -> Self {
        Self {}
    }
}

impl IGDBExtractor {
    async fn igdb_game(
        &self,
        url: Url,
        b: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        b.goto(url.to_string()).await.unwrap();

        let mut info: HashMap<String, Value> = HashMap::new();

        info.insert(
            "name".to_owned(),
            remove_last_n_chars(
                &b.get_element_text_by_xpath(r#"//*[@class="gamepage-title-wrapper"]/h1"#)
                    .await
                    .ok_or("could not get game name")?,
                4,
            )
            .into(),
        );

        info.insert(
            "id".to_owned(),
            b.get_element_text_by_xpath(r#"//*[@class="optimisly-game-maininfo"]/div[1]/span"#)
                .await
                .ok_or("could not get id")?
                .into(),
        );

        let cover_url = b
            .get_element_attr_by_xpath(r#"//*[@class="gamepage-cover"]/img[1]"#, "src")
            .await
            .ok_or("could not get cover url")?;
        info.insert(
            "cover".to_owned(),
            handle_media_url(
                &cover_url,
                &format!("igdb-{}-cover", info.get("id").unwrap().as_str().unwrap()),
                false,
                conf,
            )
            .await
            .into(),
        );

        let genre_and_platform_htmls = b
            .get_elements_by_xpath(
                r#"//*[@class="gamepage-tabs"]/div[2]/p/span[@class="text-semibold"]/.."#,
            )
            .await
            .unwrap();

        let genres_html = genre_and_platform_htmls.first().unwrap();
        let mut genres_html = genres_html
            .find_all(thirtyfour::By::Tag("a"))
            .await
            .ok()
            .ok_or("could not get genres")?;
        let mut genres: Vec<String> = vec![];
        for genre in &mut genres_html {
            genres.push(text_from!(genre));
        }
        info.insert("genre".to_owned(), genres.into());

        let platforms_html = genre_and_platform_htmls.get(1).unwrap();
        let platforms_txt = text_from!(platforms_html)[11..].to_owned();
        let mut platforms: Vec<String> = vec![];
        for platform in platforms_txt.split(", ") {
            platforms.push(platform.to_owned());
        }
        info.insert("platforms".to_owned(), platforms.into());

        info.insert(
            "url".to_owned(),
            b.get_element_attr_by_xpath(
                r#"//*[@class="gamepage-tabs"]/div[4]/div[@class="input-group"]/input"#,
                "value",
            )
            .await
            .ok_or("could not get url")?
            .into(),
        );

        let desc = b
            .get_element_by_xpath(r#"//*[@class="gamepage-tabs"]/div[2]/div[1]"#)
            .await
            .ok_or("could not get description")?;
        if let Ok(show_more) = desc.find(thirtyfour::By::Tag("span")).await {
            show_more.click().await.unwrap();
        }
        info.insert("description".to_owned(), text_from!(desc).into());

        let date_str = b
            .get_element_text_by_xpath(r#"//*[@class="banner-subheading"]/span[1]/span[1]"#)
            .await
            .ok_or("could not get release date")?;
        info.insert(
            "release".to_owned(),
            if date_str == "TBD" {
                Option::<String>::None.into()
            } else {
                Some(parse_date(&date_str, "%b %d, %Y").ok_or("could not parse release date")?)
                    .into()
            },
        );

        let mut releases: Vec<Map<String, Value>> = vec![];
        let releases_html = b
            .find(thirtyfour::By::XPath(
                r#"//*[@class="optimisly-game-maininfo"]/div[2]"#,
            ))
            .await
            .ok()
            .ok_or("could not get releases")?;
        for release in releases_html
            .find_all(thirtyfour::By::XPath("./*"))
            .await
            .unwrap()
        {
            let release_platform = text_from!(release
                .find(thirtyfour::By::XPath("./div[1]/span"))
                .await
                .unwrap());

            let release_info_html = release
                .find(By::XPath("./div[2]/div[1]/div[1]/span"))
                .await
                .unwrap();
            let release_date = text_from!(release_info_html.find(By::Tag("time")).await.unwrap());
            let release_info = text_from!(release_info_html.find(By::Tag("strong")).await.unwrap());
            releases.push(
                json!({
                    "platform": release_platform,
                    "date": release_date,
                    "info": release_info
                })
                .as_object()
                .unwrap()
                .clone(),
            );
        }
        info.insert("releases".into(), releases.into());

        let mut developers: Vec<String> = vec![];
        let developers_html = b
            .get_element_by_xpath(
                r#"//*[@class="optimisly-game-maininfo"]/div[@itemprop="author"]/span"#,
            )
            .await
            .ok_or("could not get developers")?;
        for dev in developers_html.find_all(By::Tag("a")).await.unwrap() {
            developers.push(text_from!(dev));
        }
        info.insert("developers".into(), developers.into());

        let mut publishers: Vec<String> = vec![];
        if let Some(publishers_html) = b
            .get_element_by_xpath(
                r#"//*[@class="optimisly-game-maininfo"]/span[@itemprop="publisher"]/span"#,
            )
            .await
        {
            for publ in publishers_html.find_all(By::Tag("a")).await.unwrap() {
                publishers.push(text_from!(publ));
            }
            info.insert("publishers".into(), publishers.into());
        } else {
            log::warn!("could not get publishers");
        }

        let mut ratings: Map<String, Value> = Map::new();
        let ratings_html = b
            .find(By::XPath(r#"//*[@class="gamepage-gauge"]"#))
            .await
            .ok()
            .ok_or("could not get ratings")?;
        let ratings_html = {
            let mut el: Vec<String> = vec![];
            for r in &ratings_html.find_all(By::Tag("text")).await.unwrap() {
                el.push(text_from!(r));
            }
            el
        };
        let ratings_txt: Vec<String> = ratings_html
            .into_iter()
            .filter(|x| x.chars().all(char::is_numeric) || x == "N/A")
            .collect();

        ratings.insert(
            "member".into(),
            (if ratings_txt[0] == "N/A" {
                None
            } else {
                Some(ratings_txt[0].parse::<usize>().unwrap())
            })
            .into(),
        );
        ratings.insert(
            "critic".into(),
            (if ratings_txt[1] == "N/A" {
                None
            } else {
                Some(ratings_txt[1].parse::<usize>().unwrap())
            })
            .into(),
        );
        info.insert("ratings".into(), ratings.into());

        let mut ttb: Map<String, Value> = Map::new();
        if let Some(ttb_data) = b
            .get_element_by_xpath(r#"//*[@id="content-page"]/div[2]/aside/table/tbody"#)
            .await
        {
            for row in ttb_data.find_all(By::Tag("tr")).await.unwrap() {
                ttb.insert(
                    remove_last_n_chars(&text_from!(row.find(By::Tag("th")).await.unwrap()), 1),
                    text_from!(row.find(By::Tag("td")).await.unwrap()).into(),
                );
            }
            info.insert("time_to_beat".into(), ttb.into());
        } else {
            log::warn!("could not get time to beat");
        }

        b.scroll_to_end().await.unwrap();

        if let Some(show_more) = b.get_element_by_xpath(r#"//*[@id="game-storyline"]/span[@class="text-purple cursor-pointer charLimitMore"]"#).await {
            show_more.click().await.unwrap();
        }
        if let Some(storyline_html) = b
            .get_element_text_by_xpath(r#"//*[@id="game-storyline"]/p"#)
            .await
        {
            info.insert("storyline".into(), storyline_html.into());
        } else {
            log::warn!("could not get storyline");
        }

        let recommend_div = b
            .get_element_by_xpath(r#"//*[@id="content-page"]/div[2]/div[2]/ul/div[2]/div"#)
            .await
            .unwrap();
        let mut recommended: Vec<String> = vec![];
        for game in recommend_div.find_all(By::Tag("li")).await.unwrap() {
            let game_link = game.find(By::Tag("a")).await.unwrap();
            recommended.push(b.get_url_from_link(game_link).await);
        }
        info.insert("recommendations".into(), recommended.into());

        b.scroll_to_end().await.unwrap();

        if let Some(show_all_langs) = b
            .get_element_by_xpath(r#"//*[@class="language-supports-display"]/button"#)
            .await
        {
            show_all_langs.click().await.unwrap();
        }

        for el in b.get_elements_by_xpath(r#"//*[@class="optimisly-game-extrainfo2"]/div/div/span[@class="text-purple cursor-pointer"]"#).await.unwrap() {
            el.click().await.unwrap();
        }
        let mut extra_info = String::new();
        let extra_info_html = b
            .get_element_by_xpath(r#"//*[@class="optimisly-game-extrainfo2"]"#)
            .await
            .unwrap();
        for el in extra_info_html.find_all(By::XPath("./*")).await.unwrap() {
            extra_info.push_str(&format!("{}\n", text_from!(el)));
        }

        let mut extra_map: HashMap<String, Vec<Value>> = HashMap::new();
        let mut extra_map_new: HashMap<String, Value> = HashMap::new();
        let mut last = String::new();
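        // Parse the extra-info text line by line: a line ending in ':' opens
        // a new section, and every following line is a value of that section.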
        for line in extra_info.lines() {
            let line = line.trim();

            if line.is_empty() {
                continue;
            }

            if line.ends_with(':') {
                last = remove_last_n_chars(line, 1);
                extra_map.insert(last.clone(), vec![]);
            } else {
                extra_map.get_mut(&last).unwrap().push(line.into());
            }
        }

        for key in extra_map.keys() {
            if key == "Localized titles" {
                let titles = extra_map.get(key).unwrap();
                let mut title_map: Map<String, Value> = Map::new();
                for title in titles {
                    let (lang, val) = title.as_str().unwrap().split_once(": ").unwrap();
                    title_map.insert(lang.into(), val.into());
                }
                extra_map_new.insert(key.into(), title_map.into());
            }
            if key == "Alternative titles" {
                let titles = extra_map.get(key).unwrap();
                let mut title_map: Map<String, Value> = Map::new();
                for title in titles {
                    let (lang, val) = title.as_str().unwrap().split_once(": ").unwrap();
                    title_map.insert(lang.into(), val.into());
                }
                extra_map_new.insert(key.into(), title_map.into());
            }
            if key == "Keywords" {
                let keywords = extra_map.get(key).unwrap()[0].to_string();
                extra_map_new.insert(
                    key.into(),
                    remove_last_n_chars(&keywords, 1)[1..]
                        .split(", ")
                        .map(std::string::ToString::to_string)
                        .collect(),
                );
            }
            if key == "Supported Languages" {
                let mut supported_langs: Map<String, Value> = json!({
                    "audio": Vec::<String>::new(),
                    "subtitles": Vec::<String>::new(),
                    "interface": Vec::<String>::new()
                })
                .as_object()
                .unwrap()
                .clone();

                let lang_html = b
                    .get_element_by_xpath(r#"//*[@class="language-supports-display"]/table/tbody"#)
                    .await
                    .unwrap();

                for lang in lang_html.find_all(By::Tag("tr")).await.unwrap() {
                    let support = lang.find_all(By::Tag("td")).await.unwrap();
                    let lang_name = remove_last_n_chars(&text_from!(support[0]), 1);
                    if text_from!(support[1]) == "✓" {
                        supported_langs
                            .get_mut("audio")
                            .unwrap()
                            .as_array_mut()
                            .unwrap()
                            .push(lang_name.clone().into());
                    }
                    if text_from!(support[2]) == "✓" {
                        supported_langs
                            .get_mut("subtitles")
                            .unwrap()
                            .as_array_mut()
                            .unwrap()
                            .push(lang_name.clone().into());
                    }
                    if text_from!(support[3]) == "✓" {
                        supported_langs
                            .get_mut("interface")
                            .unwrap()
                            .as_array_mut()
                            .unwrap()
                            .push(lang_name.into());
                    }
                }

                extra_map_new.insert(key.into(), supported_langs.into());
            }
        }
        extra_map.remove_entry("Localized titles");
        extra_map.remove_entry("Alternative titles");
        extra_map.remove_entry("Keywords");
        extra_map.remove_entry("Supported Languages");

        let extra_map: HashMap<String, Value> = extra_map
            .into_iter()
            .map(|(old_key, value)| {
                let new_key = escape_key(&old_key);
                (new_key, value.into())
            })
            .collect();
        let extra_map_new: HashMap<String, Value> = extra_map_new
            .into_iter()
            .map(|(old_key, value)| {
                let new_key = escape_key(&old_key);
                (new_key, value)
            })
            .collect();

        info.extend(extra_map);
        info.extend(extra_map_new);

        Ok(info)
    }
}

#[async_trait]
impl Extractor for IGDBExtractor {
    fn name(&self) -> String {
        "IGDB".to_owned()
    }

    fn supported_hosts(&self) -> Vec<&str> {
        vec!["www.igdb.com"]
    }

    async fn run_scrape(
        &self,
        url: Url,
        b: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        self.igdb_game(url, b, conf).await
    }
}
159 src/extractors/mediamarkt.rs Normal file
@@ -0,0 +1,159 @@
use crate::util::{currency, escape_key, remove_last_n_chars};

use super::prelude::*;

pub struct MediamarktExtractor {}

impl MediamarktExtractor {
    pub fn new() -> Self {
        Self {}
    }

    async fn product(
        &self,
        url: Url,
        b: &mut crate::Browser,
        _conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        b.goto(url.to_string()).await.unwrap();

        let mut info: HashMap<String, Value> = HashMap::new();

        info.insert(
            "title".into(),
            b.get_element_text_by_xpath(r#"//div[@data-test="mms-select-details-header"]/h1"#)
                .await
                .unwrap()
                .into(),
        );

        let product_info_elements = b
            .get_elements_by_xpath(
                r#"//div[@data-test="mms-select-details-header"]//p[@font-family="default"]"#,
            )
            .await
            .unwrap();
        let re = regex::Regex::new(r"[-+]?\d*\.\d+|\d+").unwrap();
        let ratings: Vec<f64> = re
            .find_iter(&text_from!(product_info_elements.first().unwrap()))
            .map(|m| m.as_str().parse::<f64>().unwrap())
            .collect();

        info.insert("rating".into(), ratings[0].into());
        info.insert("amount_of_ratings".into(), ratings[1].into());

        info.insert(
            "product_number".into(),
            text_from!(product_info_elements[1])
                .replace("Art.-Nr. ", "")
                .into(),
        );

        if let Some(discount) = b
            .get_element_text_by_xpath(
                r#"//div[@data-test="mms-product-price"]//div[@data-test="mms-badge"]/span"#,
            )
            .await
        {
            info.insert("discount".into(), discount.into());
        }
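        // Prices are rendered with a leading currency symbol; rotate it to
        // the end of the string so currency() can parse the value.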
        if let Some(orig_price) = b.get_element_text_by_xpath(r#"//div[@data-test="mms-product-price"]//div[@data-test="mms-badge"]/../p[1]/span[3]"#).await {
            let orig_price = format!("{}{}", &orig_price.chars().skip(1).collect::<String>(), &orig_price.chars().take(1).collect::<String>());
            info.insert("original_price".into(), currency(&orig_price).into());
        }

        let price = remove_last_n_chars(
            &b.get_element_text_by_xpath(r#"//span[@data-test="branded-price-whole-value"]"#)
                .await
                .unwrap(),
            1,
        );
        let price = format!(
            "{}{}",
            &price.chars().skip(2).collect::<String>(),
            &price.chars().take(2).collect::<String>()
        );
        info.insert("price".into(), currency(price.trim()).into());

        if let Some(price_decimal) = b
            .get_element_text_by_xpath(r#"//span[@data-test="branded-price-decimal-value"]"#)
            .await
        {
            let decimal = if price_decimal == "–" {
                0.0
            } else {
                format!("0.{price_decimal}").parse::<f64>().unwrap()
            };
            let old_v = info
                .get_mut("price")
                .unwrap()
                .as_object_mut()
                .unwrap()
                .get_mut("value")
                .unwrap()
                .as_f64()
                .unwrap();
            info.get_mut("price")
                .unwrap()
                .as_object_mut()
                .unwrap()
                .insert("value".into(), (old_v + decimal).into());
        }

        let mut data_information: Map<String, Value> = Map::new();
        let features_html = b
            .get_elements_by_xpath(r#"//div[@data-test="pdp-features-content"]/div/div/table"#)
            .await
            .unwrap();
        b.scroll_to_element(features_html.first().unwrap())
            .await
            .unwrap();
        for feature in features_html {
            let title = escape_key(&text_from!(feature
                .find(By::XPath("./thead//p"))
                .await
                .unwrap()));
            data_information.insert(title.clone(), Value::Object(Map::new()));
            b.scroll_to_element(&feature).await.unwrap();
            for info in feature.find_all(By::XPath("./tbody/tr")).await.unwrap() {
                b.scroll_to_element(&info).await.unwrap();
                std::thread::sleep(std::time::Duration::from_millis(50));
                let info_html = info.find_all(By::XPath("./td/p")).await.unwrap();
                let key = escape_key(&text_from!(info_html[0]));
                if key.is_empty() {
                    continue;
                }
                let val = text_from!(info_html[1]);
                data_information
                    .get_mut(&title)
                    .unwrap()
                    .as_object_mut()
                    .unwrap()
                    .insert(key, val.into());
            }
        }
        info.insert("information".into(), data_information.into());

        Ok(info)
    }
}

#[async_trait]
impl Extractor for MediamarktExtractor {
    fn name(&self) -> String {
        "MEDIAMARKT".to_owned()
    }

    fn supported_hosts(&self) -> Vec<&str> {
        vec!["www.mediamarkt.de"]
    }

    async fn run_scrape(
        &self,
        url: Url,
        b: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        self.product(url, b, conf).await
    }
}
158 src/extractors/mod.rs Normal file
@@ -0,0 +1,158 @@
use std::collections::HashMap;

use async_trait::async_trait;
use chrono::Utc;
use serde_json::Value;
use url::Url;
mod amazon;
mod anilist;
mod anisearch;
mod aur;
mod igdb;
mod mediamarkt;
mod postman;
mod steam;
mod tmdb;

use crate::Config;

mod prelude {
    pub use super::Extractor;
    pub use crate::Config;
    pub use crate::{attr_from, text_from, try_attr_from};
    pub use async_trait::async_trait;
    pub use serde_json::{json, Map, Value};
    pub use std::collections::HashMap;
    pub use std::time::Duration;
    pub use thirtyfour::By;
    pub use url::Url;
}

#[async_trait]
/// A trait for defining custom extractors to scrape data from web pages.
pub trait Extractor {
    /// Checks if the provided URL can be handled by this extractor.
    ///
    /// # Arguments
    ///
    /// * `url` - The URL to be matched against the extractor's capabilities.
    ///
    /// # Returns
    ///
    /// Returns `true` if the extractor can handle the given URL, otherwise `false`
    fn match_url(&self, url: &Url) -> bool {
        if let Some(host_str) = url.host_str() {
            if self.supported_hosts().contains(&host_str) {
                return true;
            }
        }
        false
    }

    /// Retrieves supported hosts for the extractor
    ///
    /// # Returns
    ///
    /// Returns a `Vec` of hosts supported by this extractor.
    fn supported_hosts(&self) -> Vec<&str>;

    /// Retrieves the name of the extractor.
    ///
    /// # Returns
    ///
    /// Returns a `String` containing the name of the extractor.
    fn name(&self) -> String;

    /// Performs the web scraping operation on the provided URL using the given browser
    /// instance and configuration.
    ///
    /// # Arguments
    ///
    /// * `url` - The URL to perform scraping on.
    /// * `browser` - A mutable reference to the browser instance used for scraping.
    /// * `conf` - A reference to the configuration settings for scraping.
    ///
    /// # Returns
    ///
    /// Returns a `Result` indicating either a successful scraping operation with a
    /// `HashMap` containing extracted data, or an error message as a `String` if the
    /// operation fails.
    async fn run_scrape(
        &self,
        url: Url,
        browser: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String>;
}

/// Get a list of all extractors registered.
#[must_use]
pub fn get_extractors() -> Vec<Box<dyn Extractor>> {
    vec![
        Box::new(igdb::IGDBExtractor::new()),
        Box::new(aur::AURExtractor::new()),
        Box::new(amazon::AmazonExtractor::new()),
        Box::new(anilist::AnilistExtractor::new()),
        Box::new(anisearch::AnisearchExtractor::new()),
        Box::new(mediamarkt::MediamarktExtractor::new()),
        Box::new(postman::PostmanExtractor::new()),
        Box::new(steam::SteamExtractor::new()),
        Box::new(tmdb::TmdbExtractor::new()),
    ]
}
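
/// Scrape `url` with the extractor forced in the config, or with the first
/// registered extractor whose hosts match, and print the result as JSON on
/// stdout.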
pub async fn scrape_url(url: &str, conf: &Config) {
    let p_url = Url::parse(url).expect("Invalid URL");

    let ts = Utc::now();

    let mut data: Option<Result<HashMap<String, Value>, String>> = None;

    let mut browser = crate::Browser::new(conf).await.unwrap();

    if let Some(force_ext) = &conf.force_extractor {
        let extractors = get_extractors();
        let ex = extractors
            .iter()
            .find(|x| x.name() == *force_ext)
            .unwrap()
            .to_owned();
        log::info!("Scraping '{}'", p_url.to_string());
        log::info!("Using extractor {}", ex.name());
        data = Some(ex.run_scrape(p_url, &mut browser, conf).await);
    } else {
        for x in get_extractors() {
            if x.match_url(&p_url) {
                log::info!("Scraping '{}'", p_url.to_string());
                log::info!("Using extractor {}", x.name());
                data = Some(x.run_scrape(p_url, &mut browser, conf).await);
                break;
            }
        }
    }

    browser.quit().await;

    if data.is_none() {
        log::error!("Site not supported");
        std::process::exit(1);
    }

    let data = data.unwrap();

    let mut data = match data {
        Ok(data) => data,
        Err(e) => {
            let mut h = HashMap::new();
            log::error!("Scrape failed: {e}");
            h.insert("error".into(), e.into());
            h
        }
    };

    if conf.save_ts {
        data.insert("scraped_at".to_string(), ts.timestamp_nanos().into());
    }

    println!("{}", serde_json::to_string(&data).unwrap());
}
322 src/extractors/postman.rs Normal file
@@ -0,0 +1,322 @@
use crate::util::{escape_unsafe_characters, handle_media_url, remove_last_n_chars, window};

use super::prelude::*;

pub struct PostmanExtractor {}

impl PostmanExtractor {
    pub fn new() -> Self {
        Self {}
    }
}

impl PostmanExtractor {
    async fn torrent(
        &self,
        url: Url,
        b: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        b.goto(url.to_string()).await.unwrap();

        let mut info: HashMap<String, Value> = HashMap::new();

        let info_table_html = b
            .get_element_by_xpath(r#"//*[@id="td_props"]/tbody"#)
            .await
            .ok_or("could not get info table")?;
        let entries = info_table_html.find_all(By::Tag("tr")).await.unwrap();
        for entry in entries.iter().take(entries.len() - 1) {
            if let Ok(key_name_el) = entry.find(By::XPath(r#"./td[@class="label"]/b"#)).await {
                let key_name = text_from!(key_name_el);
                let content = text_from!(entry.find(By::XPath("./td[2]")).await.unwrap());
                match key_name.as_str() {
                    "Name:" => {
                        info.insert("name".into(), content.into());
                    }
                    "Torrent file:" => {
                        let torrent_file_url = b
                            .get_url_from_link(entry.find(By::XPath("./td[2]/a[1]")).await.unwrap())
                            .await;
                        info.insert(
                            "torrent_file".into(),
                            handle_media_url(&torrent_file_url, &content, true, conf)
                                .await
                                .into(),
                        );
                    }
                    "Magnet:" => {
                        info.insert(
                            "magnet_url".into(),
                            attr_from!(
                                entry.find(By::XPath("./td[2]/a[1]")).await.unwrap(),
                                "href"
                            )
                            .into(),
                        );
                    }
                    "Infohash:" => {
                        info.insert("infohash".into(), content.into());
                    }
                    "Size:" => {
                        info.insert("size".into(), content.into());
                    }
                    "Owner:" => {
                        if content != "hidden" && content != "none (abandoned torrent)" {
                            info.insert("owner".into(), content.into());
||||
let level = attr_from!(
|
||||
entry
|
||||
.find(By::XPath("./td[2]/span[1]/img[1]"))
|
||||
.await
|
||||
.unwrap(),
|
||||
"src"
|
||||
);
|
||||
info.insert(
|
||||
"owner_level".into(),
|
||||
remove_last_n_chars(level.split('/').last().unwrap(), 4)
|
||||
.parse::<isize>()
|
||||
.unwrap()
|
||||
.into(),
|
||||
);
|
||||
}
|
||||
}
|
||||
"Main Languages:" => {
|
||||
let languages_html =
|
||||
entry.find_all(By::XPath("./td[2]/span")).await.unwrap();
|
||||
let mut languages: Vec<String> = vec![];
|
||||
for lang in languages_html {
|
||||
languages.push(attr_from!(lang, "title"));
|
||||
}
|
||||
info.insert("main_languages".into(), languages.into());
|
||||
}
|
||||
"Subtitle Languages:" => {
|
||||
let languages_html =
|
||||
entry.find_all(By::XPath("./td[2]/span")).await.unwrap();
|
||||
let mut languages: Vec<String> = vec![];
|
||||
for lang in languages_html {
|
||||
languages.push(attr_from!(lang, "title"));
|
||||
}
|
||||
info.insert("subtitle_languages".into(), languages.into());
|
||||
}
|
||||
"Hits / Downloads:" => {
|
||||
let (hits_amount, downloads_amount) = content.split_once(" / ").unwrap();
|
||||
info.insert(
|
||||
"hits_amount".into(),
|
||||
hits_amount.parse::<isize>().unwrap().into(),
|
||||
);
|
||||
info.insert(
|
||||
"downloads_amount".into(),
|
||||
downloads_amount.parse::<isize>().unwrap().into(),
|
||||
);
|
||||
}
|
||||
"Seeders / Leechers:" => {
|
||||
let (seeders_amount, leechers_amount) = content.split_once(" / ").unwrap();
|
||||
info.insert(
|
||||
"seeders_amount".into(),
|
||||
seeders_amount.parse::<isize>().unwrap().into(),
|
||||
);
|
||||
info.insert(
|
||||
"leechers_amount".into(),
|
||||
leechers_amount.parse::<isize>().unwrap().into(),
|
||||
);
|
||||
}
|
||||
"Added / Last Active:" => {
|
||||
let (added_timestamp, last_active_timestamp) =
|
||||
content.split_once(" / ").unwrap();
|
||||
info.insert("added_timestamp".into(), added_timestamp.into());
|
||||
info.insert(
|
||||
"last_active_timestamp".into(),
|
||||
(if last_active_timestamp == "No active seeders in DB" {
|
||||
None
|
||||
} else {
|
||||
Some(last_active_timestamp)
|
||||
})
|
||||
.into(),
|
||||
);
|
||||
}
|
||||
"Rating:" => {
|
||||
info.insert(
|
||||
"rating".into(),
|
||||
attr_from!(
|
||||
entry
|
||||
.find(By::XPath(r#"./td[2]/span[@id="ratingbars"]"#))
|
||||
.await
|
||||
.unwrap(),
|
||||
"title"
|
||||
)
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap()
|
||||
.parse::<f64>()
|
||||
.unwrap()
|
||||
.into(),
|
||||
);
|
||||
}
|
||||
"Description:" => {
|
||||
info.insert("description".into(), content.into());
|
||||
}
|
||||
"Category:" => {
|
||||
info.insert("category".into(), content.into());
|
||||
}
|
||||
"Subtitles:" => {
|
||||
if !content.is_empty() {
|
||||
info.insert("subtitles".into(), content.into());
|
||||
}
|
||||
}
|
||||
"Length:" => {
|
||||
if !content.is_empty() {
|
||||
info.insert("length".into(), content.into());
|
||||
}
|
||||
}
|
||||
"Genre:" => {
|
||||
if !content.is_empty() {
|
||||
info.insert("genre".into(), content.into());
|
||||
}
|
||||
}
|
||||
"Codec:" => {
|
||||
if !content.is_empty() {
|
||||
info.insert("codec".into(), content.into());
|
||||
}
|
||||
}
|
||||
"Ripper Info:" => {
|
||||
if !content.is_empty() {
|
||||
info.insert("ripper_info".into(), content.into());
|
||||
}
|
||||
}
|
||||
"Format:" => {
|
||||
if !content.is_empty() {
|
||||
info.insert("format".into(), content.into());
|
||||
}
|
||||
}
|
||||
"Bitrate:" => {
|
||||
if !content.is_empty() {
|
||||
info.insert("bitrate".into(), content.into());
|
||||
}
|
||||
}
|
||||
"Banned:" => {
|
||||
info.insert("banned".into(), (content == "yes").into());
|
||||
}
|
||||
"Immutable:" => {
|
||||
info.insert("immutable".into(), (content == "yes").into());
|
||||
}
|
||||
"Visible:" => {
|
||||
info.insert("visible".into(), (content == "yes").into());
|
||||
}
|
||||
"Comment Handling:" => {}
|
||||
_ => {
|
||||
log::debug!("unknown key {key_name}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut files: Vec<Map<String, Value>> = vec![];
|
||||
let files_info_html: Vec<_> = b
|
||||
.get_elements_by_xpath(r#"//*[@id="td_files"]/tbody/*"#)
|
||||
.await
|
||||
.ok_or("could not get files info")?
|
||||
.into_iter()
|
||||
.skip(1)
|
||||
.collect();
|
||||
for entry in files_info_html {
|
||||
let file_name = text_from!(entry.find(By::XPath("./td[1]")).await.unwrap());
|
||||
let file_size = text_from!(entry.find(By::XPath("./td[2]")).await.unwrap());
|
||||
files.push(
|
||||
json!({
|
||||
"file_name": file_name,
|
||||
"file_size": file_size
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
info.insert("files".into(), files.into());
|
||||
|
||||
let mut attachments: Map<String, Value> = Map::new();
|
||||
let attachment_html = b
|
||||
.get_element_by_xpath(r#"//table[@id="td_attachments"]/tbody/tr[1]/td[1]"#)
|
||||
.await
|
||||
.ok_or("could not get attachments")?;
|
||||
for el in attachment_html
|
||||
.find_all(By::XPath("./a/img"))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
let attachment_title = attr_from!(el, "title");
|
||||
let attachment_url = b.get_absolute_url(&attr_from!(el, "src")).await;
|
||||
attachments.insert(
|
||||
attachment_title.clone(),
|
||||
handle_media_url(
|
||||
&attachment_url,
|
||||
&format!("{}.png", escape_unsafe_characters(&attachment_title)),
|
||||
true,
|
||||
conf,
|
||||
)
|
||||
.await
|
||||
.into(),
|
||||
);
|
||||
}
|
||||
info.insert("attachments".into(), attachments.into());
|
||||
|
||||
if let Some(comments_html) = b.get_element_by_xpath(r#"//*[@id="comments"]/tbody"#).await {
|
||||
let mut comments: Vec<Map<String, Value>> = vec![];
|
||||
for comment in window(&comments_html.find_all(By::Tag("tr")).await.unwrap(), 2) {
|
||||
let comment_user =
|
||||
text_from!(comment[0].find(By::XPath("./th/b/i")).await.unwrap());
|
||||
let comment_ts = text_from!(comment[0]
|
||||
.find(By::XPath(r#"./th/span[@class="commentdate"]"#))
|
||||
.await
|
||||
.unwrap())[7..]
|
||||
.to_owned();
|
||||
let mut comment_content = text_from!(comment[1]
|
||||
.find(By::XPath(r#"./td/span[@class="commenttext"]"#))
|
||||
.await
|
||||
.unwrap());
|
||||
let comment_content_html = comment[1]
|
||||
.find_all(By::XPath(r#"./td/span[@class="commenttext"]/a"#))
|
||||
.await
|
||||
.unwrap();
|
||||
for el in comment_content_html {
|
||||
if el.tag_name().await.unwrap().as_str() == "a" {
|
||||
let link = format!("[{}]({})", text_from!(el), attr_from!(el, "href"));
|
||||
comment_content = comment_content.replace(&text_from!(el), &link);
|
||||
}
|
||||
}
|
||||
comments.push(
|
||||
json!({
|
||||
"user": comment_user,
|
||||
"timestamp": comment_ts,
|
||||
"content": comment_content
|
||||
})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
info.insert("comments".into(), comments.into());
|
||||
}
|
||||
|
||||
Ok(info)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Extractor for PostmanExtractor {
|
||||
fn name(&self) -> String {
|
||||
"POSTMAN".to_owned()
|
||||
}
|
||||
|
||||
fn supported_hosts(&self) -> Vec<&str> {
|
||||
vec!["tracker2.postman.i2p"]
|
||||
}
|
||||
|
||||
async fn run_scrape(
|
||||
&self,
|
||||
url: Url,
|
||||
b: &mut crate::Browser,
|
||||
conf: &Config,
|
||||
) -> Result<HashMap<String, Value>, String> {
|
||||
self.torrent(url, b, conf).await
|
||||
}
|
||||
}
|
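Two things worth noting about `torrent`: comments are read in pairs of `<tr>` rows via `util::window(rows, 2)`, where the first row carries the author and date and the second the body; and the properties table is flattened into a flat JSON object. A hand-written illustration of the resulting shape (every value below is made up, not captured from a real run):

```rust
use serde_json::json;

fn main() {
    // Illustrative output shape only; all values are invented.
    let example = json!({
        "name": "Example Release",
        "infohash": "ABCDEF0123456789ABCDEF0123456789ABCDEF01",
        "size": "1.37 GB",
        "seeders_amount": 12,
        "leechers_amount": 3,
        "files": [ { "file_name": "example.mkv", "file_size": "1.37 GB" } ],
        "comments": [ { "user": "alice", "timestamp": "2023-08-20 12:00", "content": "thanks" } ]
    });
    println!("{}", serde_json::to_string_pretty(&example).unwrap());
}
```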
145
src/extractors/steam.rs
Normal file

@@ -0,0 +1,145 @@
use crate::util::{currency, parse_date};

use super::prelude::*;

pub struct SteamExtractor;

impl SteamExtractor {
    pub fn new() -> Self {
        Self {}
    }

    async fn steam_game(
        &self,
        url: Url,
        b: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        let mut url = url;
        let lang = conf.language.clone();

        match lang {
            crate::Language::de_DE => {
                url.query_pairs_mut().append_pair("l", "german");
            }
            crate::Language::en_US => {
                url.query_pairs_mut().append_pair("l", "english");
            }
        }
        log::info!(
            "Switching to '{url}' because of the {:?} language setting",
            conf.language
        );

        b.goto(url.to_string()).await.unwrap();

        if b.current_url().await.unwrap().path().contains("agecheck") {
            log::info!("Game is behind an age restriction");
            let year = b
                .get_element_by_xpath(r#"//*[@id="ageYear"]"#)
                .await
                .unwrap();
            thirtyfour::components::SelectElement::new(&year)
                .await
                .unwrap()
                .select_by_value("1900")
                .await
                .unwrap();

            b.click_on_xpath(r#"//*[@id="view_product_page_btn"]"#)
                .await;

            b.wait_for(r#"//*[@id="appHubAppName"]"#, Duration::from_secs(5))
                .await;
            b.goto(url.to_string()).await.unwrap();
        }

        let game_name = b
            .get_element_text_by_xpath(r#"//*[@id="appHubAppName"]"#)
            .await
            .unwrap();

        let game_description = b
            .get_element_text_by_xpath(r#"//*[@class="game_description_snippet"]"#)
            .await
            .unwrap();

        let game_release = b
            .get_element_text_by_xpath(r#"//*[@class="release_date"]/div[2]"#)
            .await
            .unwrap();

        let game_release = match lang {
            crate::Language::de_DE => parse_date(&game_release, "%d. %b. %Y").unwrap(),
            crate::Language::en_US => parse_date(&game_release, "%d %b, %Y").unwrap(),
        };

        let game_developer = b
            .get_element_text_by_xpath(r#"//*[@id="developers_list"]/a"#)
            .await
            .unwrap();
        let game_publisher = text_from!(b
            .get_elements_by_xpath(r#"//*[@class="dev_row"]/div[2]"#)
            .await
            .unwrap()
            .get(1)
            .unwrap());

        let mut game_price: Option<Map<String, Value>> = None;

        if let Some(game_orig_price_html) = b.get_elements_by_xpath(r#"//*[@class="game_area_purchase_game_wrapper"]/div/div[2]/div/div[1]/div[2]/*[@class="discount_original_price"]"#).await.unwrap().first() {
            let game_orig_price = text_from!(game_orig_price_html);
            let game_discount_price = text_from!(b.get_elements_by_xpath(r#"//*[@class="game_area_purchase_game_wrapper"]/div/div[2]/div/div[1]/div[2]/*[@class="discount_final_price"]"#).await.unwrap().first().unwrap());
            game_price = Some(json!({
                "original_price": currency(&game_orig_price),
                "discount_price": currency(&game_discount_price),
            }).as_object().unwrap().clone());
        } else if let Some(game_price_html) = b.get_element_text_by_xpath(r#"//*[@class="game_area_purchase_game_wrapper"]/div/div[2]/div/*[@class="game_purchase_price price"]"#).await {
            game_price = Some(currency(&game_price_html));
        } else {
            let check_free_price = b.get_element_text_by_xpath(r#"//*[@class="game_purchase_action"]/div[1]/div[@class="game_purchase_price price"]"#).await.unwrap();
            match lang {
                crate::Language::de_DE => {
                    if check_free_price == "Kostenlos" {
                        game_price = Some(currency("0.0€"));
                    }
                }
                crate::Language::en_US => {
                    if check_free_price == "Free" {
                        game_price = Some(currency("0.0$"));
                    }
                }
            }
        }

        let mut info = HashMap::new();
        info.insert("name".into(), game_name.into());
        info.insert("description".into(), game_description.into());
        info.insert("release".into(), game_release.into());
        info.insert("developer".into(), game_developer.into());
        info.insert("publisher".into(), game_publisher.into());
        info.insert("price".into(), game_price.into());

        Ok(info)
    }
}

#[async_trait]
impl Extractor for SteamExtractor {
    fn supported_hosts(&self) -> Vec<&str> {
        vec!["store.steampowered.com"]
    }

    fn name(&self) -> String {
        "STEAM".to_string()
    }

    async fn run_scrape(
        &self,
        url: Url,
        browser: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        self.steam_game(url, browser, conf).await
    }
}
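The price logic above falls through three cases: a discounted price (original plus final), a plain price, and a free game. A hedged sketch of the three resulting `price` shapes, built from `util::currency`'s documented `{currency, value}` output (all numbers invented):

```rust
use serde_json::json;

fn main() {
    // The three shapes game_price can take; numbers are made up.
    let discounted = json!({
        "original_price": { "currency": "€", "value": 59.99 },
        "discount_price": { "currency": "€", "value": 29.99 }
    });
    let plain = json!({ "currency": "$", "value": 19.99 });
    let free = json!({ "currency": "$", "value": 0.0 });
    println!("{discounted}\n{plain}\n{free}");
}
```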
338
src/extractors/tmdb.rs
Normal file

@@ -0,0 +1,338 @@
use crate::util::{
    escape_key, extract_attrs_from_elements, extract_texts_from_elements, handle_media_url,
    parse_date, remove_last_n_chars,
};

use super::prelude::*;

pub struct TmdbExtractor;

impl TmdbExtractor {
    pub fn new() -> Self {
        Self {}
    }

    async fn series(
        &self,
        url: Url,
        b: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        b.goto(url.to_string()).await.unwrap();

        let prefs = json!({
            "i18n_fallback_language": "en-US",
            "locale": "en-US",
            "country_code": "US",
        });
        let prefs_str = serde_json::to_string(&prefs).unwrap();
        let lang_cookie =
            thirtyfour::Cookie::build("tmdb.prefs", urlencoding::encode(&prefs_str).into_owned())
                .domain("www.themoviedb.org")
                .path("/")
                .expires(None)
                .secure(true)
                .http_only(true)
                .same_site(thirtyfour::cookie::SameSite::Lax)
                .finish();
        b.delete_cookie("tmdb.prefs").await.unwrap();
        b.add_cookie(lang_cookie).await.unwrap();
        b.refresh().await.unwrap();

        let mut info: HashMap<String, Value> = HashMap::new();

        if b.get_element_by_xpath(r#"//*[@id="main"]//div[@class="error_wrapper"]"#)
            .await
            .is_some()
        {
            Err("page unavailable")?;
        }

        info.insert(
            "title".into(),
            b.get_element_text_by_xpath(
                r#"//*[@id="original_header"]//section[@class="header poster"]/div/h2/a"#,
            )
            .await
            .unwrap()
            .into(),
        );
        info.insert(
            "release_year".into(),
            remove_last_n_chars(
                &b.get_element_text_by_xpath(
                    r#"//*[@id="original_header"]//section[@class="header poster"]/div/h2/span"#,
                )
                .await
                .unwrap()[1..],
                1,
            )
            .into(),
        );

        let rating_html = attr_from!(
            b.get_element_by_xpath(r#"//div[@class="user_score_chart"]/div[1]/span"#)
                .await
                .unwrap(),
            "class"
        );
        let rating = rating_html.split("icon-r").nth(1).unwrap();
        info.insert(
            "user_rating".into(),
            rating.parse::<isize>().unwrap().into(),
        );

        if let Some(age_certification) = b
            .get_element_text_by_xpath(
                r#"//*[@id="original_header"]//span[@class="certification"]"#,
            )
            .await
        {
            info.insert("age_certification".into(), age_certification.into());
        }

        let genres_html = b
            .get_elements_by_xpath(r#"//*[@id="original_header"]//span[@class="genres"]/a"#)
            .await
            .unwrap();
        info.insert(
            "genres".into(),
            extract_texts_from_elements(genres_html).await.into(),
        );

        info.insert(
            "overview".into(),
            b.get_element_text_by_xpath(r#"//*[@id="original_header"]//div[@class="overview"]"#)
                .await
                .unwrap()
                .into(),
        );

        let cover_url = b
            .get_absolute_url(
                &attr_from!(
                    b.get_element_by_xpath(
                        r#"//*[@id="original_header"]//div[@class="poster"]//img"#
                    )
                    .await
                    .unwrap(),
                    "src"
                )
                .replace("_filter(blur)", ""),
            )
            .await;
        info.insert(
            "cover".into(),
            handle_media_url(&cover_url, "cover", false, conf)
                .await
                .into(),
        );

        for fact in b
            .get_elements_by_xpath(r#"//*[@id="media_v4"]//section[@class="facts left_column"]/p"#)
            .await
            .unwrap()
        {
            if let Ok(key) = fact.find(By::XPath("./strong")).await {
                let key = text_from!(key);
                if key == "Networks" {
                    continue;
                }
                info.insert(
                    escape_key(&key),
                    text_from!(fact).replace(&format!("{key}\n"), "").into(),
                );
            }
        }

        let mut tags: Vec<String> = vec![];
        for tag in b
            .get_elements_by_xpath(
                r#"//*[@id="media_v4"]//section[@class="keywords right_column"]/ul[1]/li"#,
            )
            .await
            .unwrap()
        {
            tags.push(text_from!(tag.find(By::XPath("./a")).await.unwrap()));
        }
        info.insert("tags".into(), tags.into());

        let all_seasons_url = b
            .get_url_from_link(
                b.get_element_by_xpath(
                    r#"//*[@id="media_v4"]//section[@class="panel season"]/p[1]/a"#,
                )
                .await
                .unwrap(),
            )
            .await;
        b.goto(all_seasons_url).await.unwrap();

        let mut seasons: Vec<Map<String, Value>> = vec![];
        let mut seasons_urls: Vec<String> = vec![];
        for s in extract_attrs_from_elements(
            b.get_elements_by_xpath(
                r#"//*[@id="media_v4"]//div[@class="season_wrapper"]/section/div/a"#,
            )
            .await
            .unwrap(),
            "href",
        )
        .await
        {
            seasons_urls.push(b.get_absolute_url(&s).await);
        }

        for season in seasons_urls {
            let season_data = self.season_page(season, b, conf).await?;
            seasons.push(season_data);
        }

        info.insert("seasons".into(), seasons.into());

        Ok(info)
    }

    async fn season_page(
        &self,
        url: String,
        b: &mut crate::Browser,
        _conf: &Config,
    ) -> Result<Map<String, Value>, String> {
        b.goto(url.clone()).await.unwrap();

        let mut season: Map<String, Value> = Map::new();
        season.insert(
            "season_number".into(),
            url.split('/')
                .last()
                .unwrap()
                .parse::<isize>()
                .unwrap()
                .into(),
        );
        season.insert(
            "title".into(),
            b.get_element_text_by_xpath(
                r#"//*[@id="main"]//span[@class="flex poster"]/span/div/h2/a"#,
            )
            .await
            .unwrap()
            .into(),
        );
        if let Some(release_year) = b
            .get_element_text_by_xpath(
                r#"//*[@id="main"]//span[@class="flex poster"]/span/div/h2/span"#,
            )
            .await
        {
            season.insert(
                "release_year".into(),
                remove_last_n_chars(&release_year[1..], 1).into(),
            );
        }

        season.insert(
            "amount_of_episodes".into(),
            b.get_element_text_by_xpath(
                r#"//*[@id="main_column"]//h3[@class="episode_sort space"]/span"#,
            )
            .await
            .unwrap()
            .parse::<isize>()
            .unwrap()
            .into(),
        );

        let mut episodes: Vec<Map<String, Value>> = vec![];
        let episodes_html = b
            .get_element_by_xpath(r#"//*[@id="main_column"]//div[@class="episode_list"]"#)
            .await
            .unwrap();
        for e in episodes_html
            .find_all(By::XPath(r#"./div[@class="card"]"#))
            .await
            .unwrap()
        {
            let mut episode: Map<String, Value> = Map::new();
            episode.insert(
                "episode_number".into(),
                text_from!(e
                    .find(By::XPath(r#".//span[@class="episode_number"]"#))
                    .await
                    .unwrap())
                .parse::<isize>()
                .unwrap()
                .into(),
            );
            episode.insert(
                "title".into(),
                text_from!(e
                    .find(By::XPath(r#".//div[@class="episode_title"]//a"#))
                    .await
                    .unwrap())
                .into(),
            );
            episode.insert(
                "rating".into(),
                text_from!(e
                    .find(By::XPath(
                        r#".//div[@class="episode_title"]/div[1]/div[1]/div[1]"#
                    ))
                    .await
                    .unwrap())
                .parse::<f64>()
                .unwrap()
                .into(),
            );
            if let Ok(release_date) = e
                .find(By::XPath(
                    r#".//div[@class="episode_title"]//div[@class="date"]/span[@class="date"]"#,
                ))
                .await
            {
                episode.insert(
                    "release_date".into(),
                    parse_date(&text_from!(release_date), "%B %d, %Y").into(),
                );
            }
            episode.insert("runtime".into(), text_from!(e.find(By::XPath(r#".//div[@class="episode_title"]//div[@class="date"]/span[@class="runtime"]"#)).await.unwrap()).into());

            episode.insert(
                "overview".into(),
                text_from!(e
                    .find(By::XPath(
                        r#".//div[@class="info"]//div[@class="overview"]/p"#
                    ))
                    .await
                    .unwrap())
                .into(),
            );
            episodes.push(episode);
        }

        season.insert("episodes".into(), episodes.into());

        Ok(season)
    }
}

#[async_trait]
impl Extractor for TmdbExtractor {
    fn supported_hosts(&self) -> Vec<&str> {
        vec!["www.themoviedb.org"]
    }

    fn name(&self) -> String {
        "TMDB".to_string()
    }

    async fn run_scrape(
        &self,
        url: Url,
        browser: &mut crate::Browser,
        conf: &Config,
    ) -> Result<HashMap<String, Value>, String> {
        self.series(url, browser, conf).await
    }
}
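One non-obvious step in `series` is the rating: it is not printed as text but encoded in a CSS class on the score chart, which the code splits on `"icon-r"`. The decoding in isolation (the class string below is an assumed example inferred from that parsing, not copied from TMDB):

```rust
fn main() {
    // Assumed example of the score chart's class attribute.
    let rating_html = "user_score_chart_percent icon-r85";
    // Everything after "icon-r" is the percentage.
    let rating: isize = rating_html
        .split("icon-r")
        .nth(1)
        .unwrap()
        .parse()
        .unwrap();
    assert_eq!(rating, 85);
}
```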
356
src/lib.rs
Normal file

@@ -0,0 +1,356 @@
use std::{
    ops::Deref,
    process::{Child, Command, Stdio},
    str::FromStr,
};
use strum::EnumVariantNames;
pub mod extractors;
pub mod util;
use thirtyfour::prelude::*;

/// A convenience macro for extracting text content from a web element expression.
///
/// This macro takes a single expression `$expr` that represents a web element. It uses
/// the `.text()` method to asynchronously extract text content from the web element and
/// immediately unwraps the result. This macro is useful for simplifying the process of
/// extracting text content from web elements.
#[macro_export]
macro_rules! text_from {
    ($expr:expr) => {
        $expr.text().await.unwrap()
    };
}

/// A convenience macro for extracting an attribute's value from a web element expression.
///
/// This macro takes two expressions as arguments: `$expr`, which represents a web element,
/// and `$attr`, which is the name of the attribute to extract. It uses the `.attr()` method
/// to asynchronously extract the value of the specified attribute from the web element and
/// immediately unwraps the result. This macro simplifies the process of attribute extraction.
///
/// # Note
///
/// If the attribute is not present, the macro will panic when trying to unwrap it.
#[macro_export]
macro_rules! attr_from {
    ($expr:expr, $attr:expr) => {
        $expr.attr($attr).await.unwrap().unwrap()
    };
}

/// A convenience macro for attempting to extract an attribute's value from a web element expression.
///
/// This macro takes two expressions as arguments: `$expr`, which represents a web element,
/// and `$attr`, which is the name of the attribute to extract. It uses the `.attr()` method
/// to asynchronously extract the value of the specified attribute from the web element. If the
/// attribute is not present, it returns an `Option` with `None`.
///
/// # Note
///
/// This macro returns an `Option` containing the attribute value, which can be either `Some(value)`
/// or `None` if the attribute is absent.
#[macro_export]
macro_rules! try_attr_from {
    ($expr:expr, $attr:expr) => {
        $expr.attr($attr).await.unwrap()
    };
}
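A short usage sketch of the three macros; the element lookup is assumed to have happened already, and only the expansions matter:

```rust
use scrape::{attr_from, text_from, try_attr_from};

// Hypothetical helper: `el` is a WebElement found earlier in an async context.
async fn demo(el: thirtyfour::WebElement) {
    let label: String = text_from!(el);                      // el.text().await.unwrap()
    let href: String = attr_from!(el, "href");               // panics if "href" is absent
    let title: Option<String> = try_attr_from!(el, "title"); // None if "title" is absent
    println!("{label} {href} {title:?}");
}
```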
/// A struct representing a web browser instance.
pub struct Browser {
    driver: Option<WebDriver>,
    cmd: Child,
}

impl Browser {
    pub async fn new(conf: &Config) -> Option<Self> {
        // TODO : setup http proxy
        let mut caps = DesiredCapabilities::chrome();
        let child = Command::new("chromedriver")
            .args(vec!["-p", "9515"])
            .stdout(Stdio::null())
            .stderr(Stdio::null())
            .spawn()
            .ok()?;
        if let Some(http_proxy) = conf.http_proxy.clone() {
            let (_host, _port) = http_proxy.split_once(':').unwrap();
            caps.add_chrome_arg(&format!("--proxy-server={http_proxy}"))
                .unwrap();
        }

        let driver = WebDriver::new("http://localhost:9515", caps).await.ok()?;
        Some(Self {
            driver: Some(driver),
            cmd: child,
        })
    }

    /// Scrolls to the end of the web page using the browser's `WebDriver`.
    ///
    /// This asynchronous method is used to scroll to the bottom of a web page by executing a JavaScript
    /// script using the browser's `WebDriver`. It takes no arguments and returns a `Result` containing
    /// either a `ScriptRet` indicating the script execution result or a `WebDriverError` if an error occurs.
    ///
    /// # Returns
    ///
    /// A `Result` containing either a `ScriptRet` indicating the script execution result or a `WebDriverError`.
    pub async fn scroll_to_end(&self) -> Result<ScriptRet, WebDriverError> {
        self.driver
            .as_ref()
            .unwrap()
            .execute(
                "window.scrollTo(0, document.body.scrollHeight);",
                Vec::new(),
            )
            .await
    }

    /// Clicks on a web element using its `XPath` expression.
    ///
    /// This asynchronous method is used to locate a web element using its `XPath` expression,
    /// and then perform a click action on it.
    ///
    /// The method does not return any value but panics if the operation fails.
    ///
    /// # Arguments
    ///
    /// * `xpath` - The `XPath` expression used to locate the web element.
    pub async fn click_on_xpath(&self, xpath: &str) {
        self.get_element_by_xpath(xpath)
            .await
            .unwrap()
            .click()
            .await
            .unwrap();
    }

    /// Scrolls to a specific web element using the browser's `WebDriver`.
    ///
    /// This asynchronous method is used to scroll to a specific web element by executing a JavaScript
    /// script using the browser's `WebDriver`.
    ///
    /// The method returns a `Result` containing either a `ScriptRet` indicating the script execution
    /// result or a `WebDriverError` if an error occurs.
    ///
    /// # Arguments
    ///
    /// * `e` - A reference to the web element to scroll into view.
    ///
    /// # Returns
    ///
    /// A `Result` containing either a `ScriptRet` indicating the script execution result or a `WebDriverError`.
    pub async fn scroll_to_element(&self, e: &WebElement) -> Result<ScriptRet, WebDriverError> {
        self.driver
            .as_ref()
            .unwrap()
            .execute("arguments[0].scrollIntoView();", vec![e.to_json()?])
            .await
    }

    /// Retrieves a complete URL from a link element using the browser's `WebDriver`.
    ///
    /// This asynchronous method is used to retrieve a complete URL from a web element
    /// by extracting its "href" attribute and resolving it against the current page's URL.
    ///
    /// # Arguments
    ///
    /// * `el` - The web element from which to extract the URL.
    ///
    /// # Returns
    ///
    /// A string representing the complete URL derived from the link element.
    pub async fn get_url_from_link(&self, el: WebElement) -> String {
        let url = attr_from!(el, "href");
        self.get_absolute_url(&url).await
    }

    /// Converts a URL to an absolute URL based on the current page's URL.
    ///
    /// This function takes a relative or absolute `url` as input and returns the
    /// corresponding absolute URL. If the input `url` is a relative URL, it is converted
    /// to an absolute URL using the current page's URL as the base. If the input `url` is
    /// already an absolute URL, it is returned as is.
    ///
    /// # Parameters
    ///
    /// - `url`: A string slice representing the relative or absolute URL to be converted.
    ///
    /// # Returns
    ///
    /// A `String` containing the absolute URL.
    pub async fn get_absolute_url(&self, url: &str) -> String {
        if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(url) {
            let mut current_url = self.current_url().await.unwrap();
            current_url.set_query(None);
            current_url.set_path(url);
            return current_url.to_string();
        }
        url.to_string()
    }

    /// Waits for a web element to be present using the browser's `WebDriver`.
    ///
    /// This asynchronous method is used to wait for a web element to be present in the
    /// DOM using its `XPath` expression. It takes an `XPath` string and a timeout duration
    /// as arguments and performs the following actions:
    ///
    /// 1. Queries for the web element using the provided `XPath` expression.
    /// 2. Waits for the web element to exist within the specified timeout duration.
    /// 3. Returns `true` if the web element is found within the timeout, otherwise `false`.
    ///
    /// # Arguments
    ///
    /// * `xpath` - The `XPath` expression used to locate the web element.
    /// * `timeout` - The maximum duration to wait for the web element to appear.
    ///
    /// # Returns
    ///
    /// A boolean value indicating whether the web element was found within the timeout.
    pub async fn wait_for(&self, xpath: &str, timeout: std::time::Duration) -> bool {
        self.driver
            .as_ref()
            .unwrap()
            .query(By::XPath(xpath))
            .wait(timeout, std::time::Duration::new(0, 500))
            .exists()
            .await
            .unwrap_or(false)
    }

    /// Retrieves the text content of a web element using its `XPath` expression.
    ///
    /// This asynchronous method is used to locate a web element using its `XPath` expression
    /// and retrieve its text content.
    ///
    /// # Arguments
    ///
    /// * `xpath` - The `XPath` expression used to locate the web element.
    ///
    /// # Returns
    ///
    /// An `Option` containing the text content of the web element, or `None` if not found.
    pub async fn get_element_text_by_xpath(&self, xpath: &str) -> Option<String> {
        self.find(By::XPath(xpath)).await.ok()?.text().await.ok()
    }

    /// Retrieves the value of a specific attribute from a web element using its `XPath` expression.
    ///
    /// This asynchronous method is used to locate a web element using its `XPath` expression
    /// and retrieve the value of a specific attribute.
    ///
    /// # Arguments
    ///
    /// * `xpath` - The `XPath` expression used to locate the web element.
    /// * `attr` - The name of the attribute to retrieve.
    ///
    /// # Returns
    ///
    /// An `Option` containing the value of the specified attribute, or `None` if not found.
    pub async fn get_element_attr_by_xpath(&self, xpath: &str, attr: &str) -> Option<String> {
        self.find(By::XPath(xpath))
            .await
            .ok()?
            .attr(attr)
            .await
            .ok()?
    }

    /// Retrieves a list of web elements using their `XPath` expression.
    ///
    /// This asynchronous method is used to locate multiple web elements using a common `XPath` expression.
    ///
    /// # Arguments
    ///
    /// * `xpath` - The `XPath` expression used to locate the web elements.
    ///
    /// # Returns
    ///
    /// An `Option` containing a vector of located web elements, or `None` if none are found.
    pub async fn get_elements_by_xpath(&self, xpath: &str) -> Option<Vec<WebElement>> {
        self.find_all(By::XPath(xpath)).await.ok()
    }

    /// Retrieves a single web element using its `XPath` expression.
    ///
    /// This asynchronous method is used to locate a single web element using its `XPath` expression.
    ///
    /// # Arguments
    ///
    /// * `xpath` - The `XPath` expression used to locate the web element.
    ///
    /// # Returns
    ///
    /// An `Option` containing the located web element, or `None` if not found.
    pub async fn get_element_by_xpath(&self, xpath: &str) -> Option<WebElement> {
        self.find(By::XPath(xpath)).await.ok()
    }

    /// Quits the browser instance and `WebDriver` process.
    ///
    /// This asynchronous method is used to gracefully quit the browser instance and the associated `WebDriver` process.
    pub async fn quit(mut self) {
        let b = self.driver.take().unwrap();
        b.quit().await.unwrap();
        self.cmd.kill().unwrap();
    }
}

impl Deref for Browser {
    type Target = WebDriver;

    fn deref(&self) -> &Self::Target {
        self.driver.as_ref().unwrap()
    }
}

#[allow(non_camel_case_types)]
#[derive(Debug, EnumVariantNames, Clone)]
pub enum Language {
    en_US,
    de_DE,
}

impl FromStr for Language {
    type Err = ();

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "en_US" => Ok(Language::en_US),
            "de_DE" => Ok(Language::de_DE),
            _ => Err(()),
        }
    }
}

impl ToString for Language {
    fn to_string(&self) -> String {
        match self {
            Language::en_US => "en_US".to_string(),
            Language::de_DE => "de_DE".to_string(),
        }
    }
}

pub struct Config {
    /// Save a timestamp alongside the scraped data
    pub save_ts: bool,
    /// Set the desired language for the extractor
    pub language: Language,
    /// Download media urls to disk
    pub download_media: bool,
    /// URL of the HTTP Proxy to use
    pub http_proxy: Option<String>,
    /// Embed media urls as data urls
    pub embed_media: bool,
    /// Force a specific extractor
    pub force_extractor: Option<String>,
}

impl Default for Config {
    fn default() -> Self {
        Self {
            save_ts: false,
            language: Language::en_US,
            download_media: false,
            http_proxy: None,
            embed_media: false,
            force_extractor: None,
        }
    }
}
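A hedged end-to-end sketch of driving `Browser` directly. It assumes, as `Browser::new` itself does, that a `chromedriver` binary is available on `$PATH`; the page and XPath are illustrative:

```rust
use scrape::{Browser, Config};

#[tokio::main]
async fn main() {
    // Spawns chromedriver on port 9515 and connects to it.
    let browser = Browser::new(&Config::default())
        .await
        .expect("browser startup failed");
    // Deref gives access to the underlying WebDriver...
    browser.goto("https://example.com").await.unwrap();
    // ...alongside the XPath helpers defined above.
    if let Some(h1) = browser.get_element_text_by_xpath("//h1").await {
        println!("{h1}");
    }
    // Shuts down both the session and the chromedriver child process.
    browser.quit().await;
}
```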
128
src/main.rs
Normal file

@@ -0,0 +1,128 @@
use clap::{App, Arg};
use scrape::{Config, Language};
use std::{io::Write, str::FromStr};
use strum::VariantNames;

#[must_use]
pub fn cli_args() -> clap::ArgMatches<'static> {
    App::new("Web Scraper")
        /* .arg(Arg::with_name("sites")
        .long("sites")
        .takes_value(false)
        .multiple(false)
        .help("Show all supported sites"))*/
        .arg(
            Arg::with_name("t")
                .short("t")
                .long("timestamp")
                .help("Store a timestamp when scraping"),
        )
        .arg(
            Arg::with_name("d")
                .short("d")
                .long("download")
                .help("Download any media urls that are found"),
        )
        .arg(
            Arg::with_name("lang")
                .long("lang")
                .takes_value(true)
                .required(false)
                .possible_values(Language::VARIANTS)
                .default_value("en_US")
                .help("Desired language to scrape in"),
        )
        .arg(
            Arg::with_name("http-proxy")
                .long("http-proxy")
                .takes_value(true)
                .help("HTTP Proxy"),
        )
        .arg(
            Arg::with_name("extractor")
                .long("extractor")
                .help("Force a specific extractor")
                .possible_values(
                    &scrape::extractors::get_extractors()
                        .iter()
                        .map(|x| x.name())
                        .collect::<Vec<String>>()
                        .iter()
                        .map(std::string::String::as_str)
                        .collect::<Vec<&str>>(),
                )
                .takes_value(true)
                .required(false),
        )
        .arg(
            Arg::with_name("e")
                .short("e")
                .long("embed-media")
                .help("Embed media urls as data urls"),
        )
        .arg(
            Arg::with_name("url")
                .required(true)
                .index(1)
                .help("URL to scrape"),
        )
        .get_matches()
}

fn setup_logger() {
    let mut logger = env_logger::builder();
    #[cfg(debug_assertions)]
    logger.filter_level(log::LevelFilter::Trace);
    #[cfg(not(debug_assertions))]
    logger.filter_level(log::LevelFilter::Info);

    logger
        .format(|buf, record| {
            use log::Level;

            let level = record.level();
            let color = match level {
                Level::Error => "\x1b[31m",
                Level::Warn => "\x1b[33m",
                Level::Info => "\x1b[32m",
                Level::Debug => "\x1b[34m",
                Level::Trace => "\x1b[35m",
            };

            writeln!(
                buf,
                "{}{}\x1b[0m [{}]: {}",
                color,
                record.metadata().level().to_string().to_uppercase(),
                record.metadata().target(),
                record.args()
            )
        })
        .init();
}

#[tokio::main]
async fn main() {
    setup_logger();
    let matches = cli_args();

    let http_proxy = matches.value_of("http-proxy");
    let url = matches.value_of("url").unwrap();

    let conf = Config {
        save_ts: matches.is_present("t"),
        language: Language::from_str(matches.value_of("lang").unwrap()).expect("unknown language"),
        download_media: matches.is_present("d"),
        http_proxy: http_proxy.map(std::borrow::ToOwned::to_owned),
        embed_media: matches.is_present("e"),
        force_extractor: matches
            .value_of("extractor")
            .map(std::string::ToString::to_string),
    };

    scrape::extractors::scrape_url(url, &conf).await;
}
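As a usage note, the library can be driven without the CLI. A hedged sketch of the programmatic equivalent of `scrape -t -d --lang de_DE <url>`; the URL is illustrative only:

```rust
use scrape::{Config, Language};

#[tokio::main]
async fn main() {
    // Same Config that `main` would build for: -t -d --lang de_DE
    let conf = Config {
        save_ts: true,
        download_media: true,
        language: Language::de_DE,
        ..Config::default()
    };
    // Illustrative target URL.
    scrape::extractors::scrape_url("https://store.steampowered.com/app/400/Portal/", &conf).await;
}
```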
395
src/util.rs
Normal file

@@ -0,0 +1,395 @@
use std::{io::Write, process::Stdio};

use base64::Engine;
use regex::Regex;
use thirtyfour::WebElement;

use crate::{attr_from, text_from, Config};

/// Removes the last `n` characters from the given string `input` and returns a new `String`
/// containing the modified content.
///
/// This function takes a reference to a string `input` and an unsigned integer `n`. It then
/// creates a new string containing all characters of `input` except for the last `n` characters.
/// The resulting modified string is returned.
///
/// # Arguments
///
/// * `input` - The input string from which characters will be removed.
/// * `n` - The number of characters to remove from the end of the string.
///
/// # Returns
///
/// A new `String` containing the modified content with the last `n` characters removed.
///
/// # Examples
///
/// ```
/// use scrape::util::remove_last_n_chars;
///
/// let input = "example";
/// let modified = remove_last_n_chars(input, 3);
/// assert_eq!(modified, "exam");
/// ```
#[must_use]
pub fn remove_last_n_chars(input: &str, n: usize) -> String {
    input[..input.len() - n].to_string()
}

/// Extracts consecutive, non-overlapping windows (chunks) of a specified size from a slice.
///
/// Given a slice `lst` and a window `size`, this function extracts non-overlapping
/// windows of the specified size from the slice: after each extracted window, `size`
/// elements are skipped before the next window begins.
///
/// # Parameters
///
/// - `lst`: A slice of elements from which windows will be extracted.
/// - `size`: The size of each window.
///
/// # Returns
///
/// A vector of vectors, where each inner vector represents an extracted window
/// of elements from the input slice.
///
/// # Examples
///
/// ```rust
/// use scrape::util::window;
///
/// let input: Vec<&str> = vec!["r", "u", "s", "t"];
/// let out = window(&input, 2);
/// assert_eq!(out, vec![vec!["r", "u"], vec!["s", "t"]]);
///
/// let input: Vec<&str> = vec!["a", "b", "c", "d", "e", "f", "g", "h", "i"];
/// let out = window(&input, 3);
/// assert_eq!(out, vec![vec!["a", "b", "c"], vec!["d", "e", "f"], vec!["g", "h", "i"]]);
/// ```
pub fn window<T: Clone>(lst: &[T], size: usize) -> Vec<Vec<T>> {
    let mut result = Vec::new();

    let mut wd = lst.windows(size);

    for i in 0..wd.len() {
        let window = wd.next().unwrap();
        if i % size != 0 {
            continue;
        }
        result.push(window.to_owned());
    }

    result
}

/// Escapes unsafe characters in the given filename and returns a new `String` with
/// the unsafe characters replaced by underscores.
///
/// This function takes a reference to a filename string and scans it for characters that
/// are considered unsafe in filenames, such as `<`, `>`, `:`, `"`, `/`, `\`, `|`, `?`, `*`,
/// control characters (0x00-0x1F), and DEL (0x7F). It then replaces all occurrences of such
/// unsafe characters with underscores (`_`) in the filename and returns the modified string.
///
/// The function uses the `regex` crate to perform the replacement.
///
/// # Arguments
///
/// * `filename` - The filename string containing unsafe characters.
///
/// # Returns
///
/// A new `String` with unsafe characters replaced by underscores.
///
/// # Examples
///
/// ```
/// use scrape::util::escape_unsafe_characters;
///
/// let filename = "hello/world?.txt";
/// let escaped = escape_unsafe_characters(filename);
/// assert_eq!(escaped, "hello_world_.txt");
/// ```
#[must_use]
pub fn escape_unsafe_characters(filename: &str) -> String {
    let unsafe_chars = r#"[<>:"/\\|?*\x00-\x1F\x7F]"#;
    let re = Regex::new(unsafe_chars).unwrap();
    re.replace_all(filename, "_").to_string()
}

/// Parses a date string using the provided format and returns the parsed date
/// in the "YYYY-MM-DD" format as a `String`.
///
/// This function takes a reference to a date string and a format string that specifies
/// the expected format of the input date. It attempts to parse the input date using the
/// given format and returns an `Option<String>` containing the parsed date in the
/// "YYYY-MM-DD" format if the parsing is successful. If parsing fails, `None` is returned.
///
/// The function uses the `chrono` crate to handle date parsing and formatting.
///
/// # Arguments
///
/// * `date` - The input date string to be parsed.
/// * `format` - The format string specifying the expected format of the input date.
///
/// # Returns
///
/// An `Option<String>` containing the parsed date in "YYYY-MM-DD" format if parsing is successful,
/// otherwise `None`.
///
/// # Examples
///
/// ```
/// use scrape::util::parse_date;
///
/// let date_str = "20-08-2023";
/// let format_str = "%d-%m-%Y";
/// let parsed = parse_date(date_str, format_str);
/// assert_eq!(parsed, Some(String::from("2023-08-20")));
/// ```
#[must_use]
pub fn parse_date(date: &str, format: &str) -> Option<String> {
    let date = chrono::NaiveDate::parse_from_str(date, format).ok()?;
    Some(date.format("%Y-%m-%d").to_string())
}

/// Handles a media URL based on the provided configuration, downloading, saving, and
/// potentially embedding the media content as a data URL.
///
/// This asynchronous function takes a reference to a URL, a file name, a boolean flag
/// indicating whether to use the raw file name, and a reference to a `Config` instance.
///
/// # Arguments
///
/// * `url` - The URL of the media content.
/// * `file_name` - The desired file name for saving the media content.
/// * `raw_file_name` - A flag indicating whether to use the raw file name.
/// * `conf` - A reference to a `Config` instance containing configuration settings.
///
/// # Returns
///
/// A `String` representing the Data URL or the original URL.
pub async fn handle_media_url(
    url: &str,
    file_name: &str,
    raw_file_name: bool,
    conf: &Config,
) -> String {
    let file_name = escape_unsafe_characters(file_name);
    if conf.download_media || conf.embed_media {
        let data = download(url, conf).await;
        if data.is_err() {
            log::error!("Downloading '{url}' failed");
            return url.to_string();
        }
        let data = data.unwrap();
        if conf.download_media {
            if raw_file_name {
                save_raw(url, &data, &file_name);
            } else {
                save(url, &file_name, &data);
            }
        }
        if conf.embed_media {
            return to_data_url(&data);
        }
    }
    url.to_string()
}
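To summarize the branching in `handle_media_url`: with both flags off it is a pass-through, `download_media` additionally writes the bytes to disk, and `embed_media` replaces the URL with an inline data URL. A hedged usage sketch (the URL is illustrative):

```rust
use scrape::{util::handle_media_url, Config};

#[tokio::main]
async fn main() {
    let conf = Config {
        embed_media: true,
        ..Config::default()
    };
    // With embed_media set, a successful download comes back inline as
    // "data:<mime>;base64,<payload>" instead of the original URL.
    let value = handle_media_url("https://example.com/cover.jpg", "cover", false, &conf).await;
    println!("{value}");
}
```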
/// Escapes a given string by replacing spaces with underscores and converting it to lowercase.
///
/// This function takes a reference to a string `s` and performs the following actions:
///
/// 1. Replaces all occurrences of space characters (' ') with underscores ('_').
/// 2. Converts the entire string to lowercase.
///
/// The modified string is then returned.
///
/// # Arguments
///
/// * `s` - The input string to be escaped.
///
/// # Returns
///
/// A new `String` with spaces replaced by underscores and converted to lowercase.
///
/// # Examples
///
/// ```
/// use scrape::util::escape_key;
///
/// let original = "Hello World";
/// let escaped = escape_key(original);
/// assert_eq!(escaped, "hello_world");
/// ```
#[must_use]
pub fn escape_key(s: &str) -> String {
    s.replace(' ', "_").to_lowercase()
}

/// Converts binary data into a data URL string.
///
/// This function takes a reference to a slice of bytes `data` and encodes it as a
/// `data:` URL. The `file` command is used to determine the MIME type of the data
/// by reading it from stdin.
///
/// # Arguments
///
/// * `data` - The binary data to be converted to a data URL.
///
/// # Returns
///
/// A `String` containing the data URL.
#[must_use]
pub fn to_data_url(data: &[u8]) -> String {
    let mut file_cmd = std::process::Command::new("file")
        .arg("--mime-type")
        .arg("-")
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()
        .unwrap();

    {
        let mut stdin = file_cmd.stdin.take().unwrap();
        stdin.write_all(data).unwrap();
    }

    let out = file_cmd.wait_with_output().expect("file executable error");
    let stdout = String::from_utf8_lossy(&out.stdout).to_string();

    let mime_type = remove_last_n_chars(&stdout.replace("/dev/stdin: ", ""), 1);
    let base64_data = base64::engine::general_purpose::STANDARD.encode(data);
    format!("data:{mime_type};base64,{base64_data}")
}

/// Downloads content from the provided URL using the given configuration.
///
/// # Arguments
///
/// * `url` - The URL from which to download content.
/// * `conf` - A reference to a `Config` instance containing optional proxy settings.
///
/// # Returns
///
/// A `Result` containing either the downloaded content as a `Vec<u8>` or an error message.
pub async fn download(url: &str, conf: &crate::Config) -> Result<Vec<u8>, String> {
    let mut client_builder = reqwest::Client::builder();
    if conf.http_proxy.is_some() {
        let proxy = reqwest::Proxy::http(conf.http_proxy.clone().unwrap())
            .ok()
            .ok_or("could not create proxy")?;
        client_builder = client_builder.proxy(proxy);
    }
    let user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36";
    let client = client_builder
        .user_agent(user_agent)
        .build()
        .ok()
        .ok_or("could not create client")?;

    let resp = client.get(url).send().await.ok().ok_or("request failed")?;

    if resp.status() == reqwest::StatusCode::OK {
        let data = resp
            .bytes()
            .await
            .ok()
            .ok_or("could not get response body")?;
        return Ok(data.to_vec());
    }

    Err(format!("Request failed with Status {}", resp.status()))
}

fn save_raw(url: &str, data: &[u8], file_name: &str) {
    match std::fs::write(file_name, data) {
        Ok(()) => {
            log::info!("Saved '{url}' to '{file_name}'");
        }
        Err(e) => {
            log::error!("Error saving '{url}': {e:?}");
        }
    }
}

fn save(url: &str, file_name: &str, data: &[u8]) {
    let p_url = url::Url::parse(url).unwrap();
    let path_segments: Vec<_> = p_url.path_segments().unwrap().collect();
    let file_ending = (*path_segments.last().unwrap_or(&"")).to_string();
    let file_name = format!("{file_name}.{file_ending}");
    save_raw(url, data, &file_name);
}

/// Extracts text content from a collection of web elements asynchronously.
///
/// # Arguments
///
/// * `v` - A `Vec<WebElement>` from which to extract text content.
///
/// # Returns
///
/// A `Vec<String>` containing the extracted text content from each `WebElement`.
pub async fn extract_texts_from_elements(v: Vec<WebElement>) -> Vec<String> {
    let mut ret: Vec<_> = vec![];
    for e in v {
        ret.push(text_from!(e));
    }
    ret
}

/// Extracts an attribute from a collection of web elements asynchronously.
///
/// # Arguments
///
/// * `v` - A `Vec<WebElement>` from which to extract the attribute.
///
/// # Returns
///
/// A `Vec<String>` containing the extracted attribute value from each `WebElement`.
pub async fn extract_attrs_from_elements(v: Vec<WebElement>, attr: &str) -> Vec<String> {
    let mut ret: Vec<_> = vec![];
    for e in v {
        ret.push(attr_from!(e, attr));
    }
    ret
}

/// Parses a string containing a currency value and symbol into a JSON map.
///
/// This function takes a reference to a string `v` representing a currency value along
/// with its symbol (e.g., "$123.45").
///
/// # Arguments
///
/// * `v` - The input string containing a currency value and symbol.
///
/// # Returns
///
/// A JSON map with keys "currency" and "value" representing the currency symbol and value, respectively.
///
/// # Examples
///
/// ```rust
/// use scrape::util::currency;
///
/// let currency_str = "123.45$";
/// let json_map = currency(currency_str);
/// assert_eq!(json_map.get("currency").unwrap().as_str().unwrap(), "$");
/// assert_eq!(json_map.get("value").unwrap().as_f64().unwrap(), 123.45);
/// ```
#[must_use]
pub fn currency(v: &str) -> serde_json::Map<String, serde_json::Value> {
    let re = Regex::new(r"^([\d,.]+)([^\d,.]+)$").unwrap();

    let captures = re.captures(v).unwrap();
    let value_str = captures.get(1).unwrap().as_str().replace(',', ".");
    let value = value_str.parse::<f64>().unwrap();
    let currency_symbol = captures.get(2).unwrap().as_str().to_string();

    let mut result: serde_json::Map<String, serde_json::Value> = serde_json::Map::new();
    result.insert("currency".to_string(), currency_symbol.into());
    result.insert("value".to_string(), value.into());

    result
}