use log::*; use threadpool::ThreadPool; use colored::*; use linkify::{LinkFinder, LinkKind}; use std::fs::File; use std::io::prelude::*; use reqwest::header; use std::time::Duration; use std::borrow::Cow; use std::sync::Arc; use std::sync::Mutex; use std::sync::atomic::Ordering; use rustc_hash::{FxHashMap, FxHashSet}; enum UrlStatus { Unknown, UrlOk, UrlError(String), } struct HashVal { paths: FxHashSet, status: UrlStatus, } type UrlHash = FxHashMap; pub struct LinkCheck { pool: Mutex, urlhash: Arc>, print_all: bool, } impl LinkCheck { pub fn new(num_threads: usize, print_all: bool) -> LinkCheck { openssl_probe::init_ssl_cert_env_vars(); let pool = Mutex::new(ThreadPool::new(num_threads)); LinkCheck { pool, urlhash: Arc::new(Mutex::new(FxHashMap::default())), print_all, } } pub fn check_urls(&self, fname: &str) { let print_all = self.print_all; if let Some(links) = get_links(fname) { for l in links { let urlhash = self.urlhash.clone(); let fname_s = String::from(fname); self.pool.lock().unwrap().execute(move || { check_link(&l, &fname_s, &urlhash, print_all); }); } } } } impl Drop for LinkCheck { fn drop(&mut self) { //println!("Now dropping ..."); let pool = self.pool.lock().unwrap(); pool.join(); } } fn check_link(url: &str, fname: &str, urlhash: &Arc>, print_all: bool) { let url = String::from(url); let mut run_check_link = false; // It is very important to keep the lock for the urlhash // only for a short period of time // // If we don't find the url in the urlhash then // we set `run_check_link` to `true` so that we will // check the url { let f = String::from(fname); let mut urlhash = urlhash.lock().unwrap(); if !urlhash.contains_key(&url) { let mut hs = FxHashSet::default(); hs.insert(f); let url1 = url.clone(); urlhash.insert( url1, HashVal { status: UrlStatus::Unknown, paths: hs, }, ); run_check_link = true; } else if let Some(hs) = urlhash.get_mut(&url) { match &hs.status { UrlStatus::Unknown => { hs.paths.insert(f); } UrlStatus::UrlOk => { if print_all { print_ok(super::ARGS.no_colors, &url, &f); }; } UrlStatus::UrlError(e) => { e0022!(f, e); } } } } // if run_check_link { match check_link_inner(&url, true) { UrlStatus::UrlOk => { let mut urlhash = urlhash.lock().unwrap(); if let Some(mut hs) = urlhash.get_mut(&url) { if print_all { for p in hs.paths.iter() { print_ok(super::ARGS.no_colors, &url, p); } } hs.status = UrlStatus::UrlOk; } } UrlStatus::UrlError(e) => { let mut urlhash = urlhash.lock().unwrap(); if let Some(mut hs) = urlhash.get_mut(&url) { for p in hs.paths.iter() { e0022!(p, e); } hs.status = UrlStatus::UrlError(e); } } _ => (), } } } fn get_links(fname: &str) -> Option> { let fhdl = File::open(fname); match fhdl { Ok(mut f) => { let mut buf = Vec::new(); match f.read_to_end(&mut buf) { Ok(_bytes_read) => { return get_links_inner(&String::from_utf8_lossy(&buf)); } Err(e) => error!("Error reading file {}: {:?}", fname, e), } } Err(e) => error!("Error opening file {}: {}", fname, e), } None } fn resolve_entities(url: &str) -> Cow<'static, str> { if !url.contains('&') { return String::from(url).into(); } let v = vec![("ö", "ö"), ("ü", "ü")]; let mut url_new = String::from(url); for (e, r) in v { url_new = url_new.replace(e, r); } url_new.into() } // retrieves links in a string and then checks those links fn get_links_inner(s: &str) -> Option> { let mut finder = LinkFinder::new(); finder.kinds(&[LinkKind::Url]); // finder.links() does the actual search for URLs let links: Vec<_> = finder.links(s).collect(); let result: Vec<&str> = links.iter().map(|e| e.as_str()).collect(); let mut links = vec![]; for r in result { if !r.starts_with("http://") && !r.starts_with("https://") && !r.starts_with("ftp://") { continue; } // This is a workaround to prevent URLs ending with certain characters let url = resolve_entities(r.trim_end_matches(|c| c == '。' || c == '`')); links.push(url.into()); } if !links.is_empty() { Some(links) } else { None } } fn check_link_inner(l: &str, head: bool) -> UrlStatus { let mut headers = header::HeaderMap::new(); headers.insert( header::USER_AGENT, header::HeaderValue::from_static( "Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0", ), ); let default_policy = reqwest::redirect::Policy::default(); let policy = reqwest::redirect::Policy::custom(move |attempt| { if attempt.url().host_str() == Some("127.0.0.1") { attempt.stop() } else { default_policy.redirect(attempt) } }); let cb = reqwest::blocking::Client::builder() .gzip(true) .redirect(policy) .default_headers(headers) .timeout(Duration::from_secs(7)) .build() .unwrap(); // let url: Url = // match l.parse() { // Ok(url) => url, // Err(e) => { println!("Error: {:?}", e); panic!("Scheiss"); } // }; let resp = if head { cb.head(l).send() } else { cb.get(l).send() }; match resp { Ok(s) => { if s.status().is_informational() || s.status().is_success() || s.status().is_redirection() { return UrlStatus::UrlOk; } if head { check_link_inner(l, false) } else { let e = format!("{}: {}", l, s.status()); UrlStatus::UrlError(e) } } Err(e) => { let e = format!("{}", e); UrlStatus::UrlError(e) } } } fn print_ok(no_colors: bool, url: &str, f: &str) { if no_colors { info!("✔ {} in {}", &url, f); } else { // println!("✔ {} in {}", &url, f); info!("{} {} in {}", "✔".bright_green().bold(), url, f); } }