exec("CREATE TABLE IF NOT EXISTS sentences (id INTEGER PRIMARY KEY, sentence TEXT, scrape_count INTEGER DEFAULT 1, listen_count INTEGER DEFAULT 0)");
    // NOTE(review): the opening of create_schema() lies outside the visible
    // part of this file; its remaining statements are reproduced unchanged.
    //$db->exec("CREATE TABLE IF NOT EXISTS queue (id INTEGER PRIMARY KEY, sentence_id INTEGER, is_read INTEGER DEFAULT 0, ip_address TEXT)");
    $db->exec("PRAGMA journal_mode=WAL");
    $db->exec("PRAGMA synchronous=NORMAL");
}

// Websites whose text dump is harvested on every run.
$websites = array(
    "https://news.ycombinator.com/newest",
    "https://www.thecoli.com/whats-new/latest-activity",
    "https://www.lipstickalley.com/whats-new/latest-activity",
    "https://www.reddit.com/new/",
    "https://slashdot.org/recent",
    "https://fark.com",
    "https://lobste.rs/recent",
    "https://9gag.com/fresh"
);

// Manual equivalent for a single site:
//   lynx -dump -accept_all_cookies -nolist https://news.ycombinator.com/newest
// (w3m -dump was used previously and works as an alternative text browser.)

// Create the database schema if it doesn't exist
create_schema($db);

// Prepare both statements once and reuse them for every line instead of
// re-preparing them per sentence.
$stmt_update = $db->prepare("UPDATE sentences SET scrape_count = scrape_count + 1 WHERE sentence=:sentence");
$stmt_insert = $db->prepare("INSERT INTO sentences (sentence) VALUES (:sentence)");

if ($stmt_update === false || $stmt_insert === false) {
    // Without the statements nothing can be stored; stop instead of hitting a
    // fatal "call to a member function on bool" inside the loop.
    error_log("Failed to prepare sentence statements: " . $db->lastErrorMsg());
    $db->close();
    exit(1);
}

// Loop through each website
foreach ($websites as $website) {
    // Dump the rendered page text to stdout with lynx: cookies are accepted
    // automatically and the trailing link list is suppressed. The URL is
    // shell-quoted so characters such as '?', '&' or ';' in a URL cannot be
    // interpreted by the shell.
    $output = shell_exec("lynx -dump -accept_all_cookies -nolist " . escapeshellarg($website));

    // shell_exec() yields null/false when the command fails or prints nothing;
    // there is nothing to store for this site in that case.
    if (!is_string($output) || $output === '') {
        continue;
    }

    // Each line of the dump is treated as one "sentence". A sentence-boundary
    // split (/(?<=[.!?\n])\s+(?=[a-z])/i) was tried earlier and is not used.
    foreach (explode("\n", $output) as $sentence) {
        // Blank / whitespace-only lines carry no text worth counting.
        if (trim($sentence) === '') {
            continue;
        }

        // Try to bump the scrape counter of an already known sentence first.
        // The text is stored exactly as scraped (not trimmed) so it keeps
        // matching rows written by earlier runs.
        $stmt_update->reset();
        $stmt_update->bindValue(':sentence', $sentence);
        if ($stmt_update->execute() === false) {
            // The failure was already reported by SQLite3; do not act on a
            // stale changes() value.
            continue;
        }

        // No row was touched, so the sentence is new: insert it. scrape_count
        // starts at its column default of 1.
        if ($db->changes() === 0) {
            $stmt_insert->reset();
            $stmt_insert->bindValue(':sentence', $sentence);
            $stmt_insert->execute();

            // Queueing new sentences to be read later is currently disabled
            // (the queue table is not created either). Former statement:
            //   INSERT INTO queue (sentence_id) SELECT id FROM sentences WHERE sentence=:sentence
        }
    }
}

// Release the prepared statements, then close the database connection
$stmt_update->close();
$stmt_insert->close();
$db->close();
?>