restore_articles - Action News

/**
 * Restores article pages listed in the site feed from Wayback Machine
 * snapshots, targeting /articles/*.
 *
 * Real-time progress:
 *  - Disables output buffering; flushes after each step.
 */
ini_set('display_errors', 1);
error_reporting(E_ALL);
date_default_timezone_set('UTC');

/*** CONFIG ***/
$BASE         = '/var/wwwroot/westnet.ca';
$RSS_FILE     = $BASE . '/rss.xml';
$ARTICLES_DIR = $BASE . '/articles';

/*** INPUT ***/
$SAVE         = isset($_GET['save']) && $_GET['save'] == '1';
$ONLY_MISSING = isset($_GET['only_missing']) && $_GET['only_missing'] == '1';
$LIMIT        = isset($_GET['limit']) ? max(1, intval($_GET['limit'])) : 10000;
$MEDIA        = isset($_GET['media']) && $_GET['media'] == '1';
$FORCE        = isset($_GET['force']) && $_GET['force'] == '1';
$VERBOSE      = isset($_GET['verbose']) && $_GET['verbose'] == '1';
// strtotime() yields false for an unparseable date; the range checks below
// test truthiness, so a bad value simply disables that bound.
$FROM         = isset($_GET['from']) ? strtotime($_GET['from'] . ' 00:00:00 UTC') : null;
$TO           = isset($_GET['to']) ? strtotime($_GET['to'] . ' 23:59:59 UTC') : null;
// slug | date | datedslug. Unknown values already behaved like 'slug' in
// build_target_name(); normalise here so the banner reports the effective mode.
$NAME_MODE    = (isset($_GET['name']) && in_array($_GET['name'], array('slug', 'date', 'datedslug'), true))
    ? $_GET['name']
    : 'slug';
$EXT          = (isset($_GET['ext']) && in_array($_GET['ext'], array('htm', 'txt'), true))
    ? $_GET['ext']
    : 'htm';
$ALIAS        = isset($_GET['alias']) && $_GET['alias'] == '1';

/*** OUTPUT MODE ***/
if (php_sapi_name() !== 'cli') {
    header('Content-Type: text/plain; charset=UTF-8');
    header('X-Accel-Buffering: no'); // nginx
}
@ini_set('zlib.output_compression', '0');
@ini_set('output_buffering', '0');
// Stop as soon as a buffer refuses to close; otherwise a non-removable
// buffer would keep ob_get_level() > 0 forever.
while (ob_get_level() > 0 && @ob_end_flush()) {
    // flushing one nesting level per iteration
}
ob_implicit_flush(true);

/*** UTILS ***/

/**
 * Prints one line and flushes it to the client immediately.
 *
 * @param string $s Text to print (a newline is appended).
 */
function out($s = '')
{
    echo $s . "\n";
    @flush();
}

/**
 * Prints a line only when verbose mode ($VERBOSE) is on.
 *
 * @param string $s Text to print.
 */
function vv($s)
{
    global $VERBOSE;
    if ($VERBOSE) {
        out($s);
    }
}

/**
 * Creates a directory (recursively, mode 0755) if it does not exist.
 *
 * @param string $d Directory path.
 * @return bool True when the directory exists after the call.
 */
function ensure_dir($d)
{
    if (!is_dir($d)) {
        @mkdir($d, 0755, true);
    }
    return is_dir($d);
}

/**
 * Performs an HTTP GET, using cURL when available and falling back to
 * file_get_contents() with a stream context.
 *
 * @param string $url     URL to fetch.
 * @param int    $timeout Connect/transfer timeout in seconds.
 * @return array array($httpCode, $body, $error). With cURL, $body is false on
 *               failure and $error is curl_error() ('' when none). In the
 *               fallback path the code is 200 on success and 0 on failure.
 */
function http_get($url, $timeout = 25)
{
    if (function_exists('curl_init')) {
        $ch = curl_init($url);
        curl_setopt_array($ch, array(
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_CONNECTTIMEOUT => $timeout,
            CURLOPT_TIMEOUT        => $timeout,
            CURLOPT_USERAGENT      => 'WestNet-RestoreBot/2.0 (PHP 5.6)'
        ));
        $body = curl_exec($ch);
        $code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
        $err  = curl_error($ch);
        curl_close($ch);
        return array($code, $body, $err);
    }

    $ctx = stream_context_create(array('http' => array(
        'method'  => 'GET',
        'timeout' => $timeout,
        'header'  => "User-Agent: WestNet-RestoreBot/2.0\r\n"
    )));
    $body = @file_get_contents($url, false, $ctx);
    if ($body === false) {
        return array(0, false, 'file_get_contents failed');
    }
    return array(200, $body, null);
}

/**
 * Strips a UTF-8 BOM and characters that XML 1.0 does not allow.
 *
 * @param string $s Raw feed text.
 * @return string Cleaned text. When the input is not valid UTF-8 the
 *                Unicode-aware pass cannot run, so only ASCII control bytes
 *                are removed instead of discarding the whole feed.
 */
function clean_xml($s)
{
    $s = (string)$s;
    $noBom = preg_replace('/^\xEF\xBB\xBF/', '', $s);
    if ($noBom !== null) {
        $s = $noBom;
    }

    $clean = preg_replace('/[^\x09\x0A\x0D\x20-\x{D7FF}\x{E000}-\x{FFFD}]/u', '', $s);
    if ($clean !== null) {
        return $clean;
    }

    // preg_replace() returns null for malformed UTF-8 under /u; fall back to
    // a byte-level strip of the control characters XML forbids.
    $clean = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F]/', '', $s);
    return $clean !== null ? $clean : $s;
}

/**
 * Extracts items from an RSS or Atom document.
 *
 * Tries SimpleXML RSS (channel/item), then SimpleXML Atom (entry), then a
 * DOM/XPath scan for any item or entry element.
 *
 * @param string $xmlString Feed source.
 * @return array List of array('title' => ..., 'link' => ..., 'date' => ...);
 *               empty when the feed is missing or unparseable.
 */
function parse_feed_items($xmlString)
{
    $items = array();
    if (!$xmlString) {
        return $items;
    }

    libxml_use_internal_errors(true);
    $xmlString = clean_xml($xmlString);
    $xml = @simplexml_load_string($xmlString);
    libxml_clear_errors(); // internal errors are collected, not needed
    if (!$xml) {
        return $items;
    }

    // Try RSS first
    if (isset($xml->channel->item)) {
        foreach ($xml->channel->item as $it) {
            // Trim before choosing so a whitespace-only <link> still falls
            // back to <guid>.
            $link = trim((string)$it->link);
            if ($link === '') {
                $link = trim((string)$it->guid);
            }
            $items[] = array(
                'title' => trim((string)$it->title),
                'link'  => $link,
                'date'  => trim((string)$it->pubDate)
            );
        }
        return $items;
    }

    // Atom (entry)
    if (isset($xml->entry)) {
        foreach ($xml->entry as $en) {
            $link = '';
            if (isset($en->link)) {
                // pick rel="alternate" href if present (no rel counts too)
                foreach ($en->link as $ln) {
                    $attrs = $ln->attributes();
                    if (!isset($attrs['rel']) || (string)$attrs['rel'] === 'alternate') {
                        $link = (string)$attrs['href'];
                        break;
                    }
                }
            }
            $date = trim((string)$en->updated);
            if ($date === '') {
                $date = trim((string)$en->published);
            }
            $items[] = array(
                'title' => trim((string)$en->title),
                'link'  => trim($link),
                'date'  => $date
            );
        }
        return $items;
    }

    // Fallback: XPath all //item or //entry
    $dom = new DOMDocument();
    $loaded = @$dom->loadXML($xmlString);
    libxml_clear_errors();
    if (!$loaded) {
        return $items;
    }

    $xp = new DOMXPath($dom);
    foreach ($xp->query('//item') as $node) {
        $link = trim($xp->evaluate('string(link)', $node));
        if ($link === '') {
            $link = trim($xp->evaluate('string(guid)', $node));
        }
        $items[] = array(
            'title' => trim($xp->evaluate('string(title)', $node)),
            'link'  => $link,
            'date'  => trim($xp->evaluate('string(pubDate)', $node))
        );
    }
    if ($items) {
        return $items;
    }

    // local-name() matches entries regardless of namespace prefix.
    foreach ($xp->query('//*[local-name()="entry"]') as $node) {
        $title = $xp->evaluate('string(./*[local-name()="title"])', $node);

        $link = '';
        foreach ($xp->query('./*[local-name()="link"]', $node) as $ln) {
            $rel = $ln->attributes->getNamedItem('rel');
            if (!$rel || $rel->nodeValue === 'alternate') {
                $href = $ln->attributes->getNamedItem('href');
                if ($href) {
                    $link = $href->nodeValue;
                    break;
                }
            }
        }

        $date = $xp->evaluate('string(./*[local-name()="updated"])', $node);
        if (!$date) {
            $date = $xp->evaluate('string(./*[local-name()="published"])', $node);
        }

        $items[] = array(
            'title' => trim($title),
            'link'  => trim($link),
            'date'  => trim($date)
        );
    }

    return $items;
}

/**
 * Extracts the article slug from a link's ?article=SLUG query parameter.
 *
 * @param string $url Link from the feed (any domain).
 * @return string|null Slug with characters outside [\w-.%] replaced by '_',
 *                     or null when the link carries no usable slug.
 */
function parse_slug_from_link($url)
{
    // Accept any domain; get ?article=SLUG
    $p = @parse_url($url);
    if (!$p || empty($p['query'])) {
        return null;
    }

    parse_str($p['query'], $q);
    // article[]=x would arrive as an array; treat it as "no slug".
    if (!isset($q['article']) || !is_string($q['article'])) {
        return null;
    }

    $slug = preg_replace('/[^\w\-\.\%]/u', '_', urldecode($q['article']));
    if ($slug === null) {
        return null; // malformed UTF-8 in the slug
    }
    return trim($slug);
}

/**
 * Picks the Wayback Machine snapshot closest to a timestamp.
 *
 * Fetches the Wayback listing page for the URL and scans it for 14-digit
 * snapshot stamps.
 *
 * @param string   $absoluteUrl Original page URL.
 * @param int|null $ts          Target Unix time; when empty the most recent
 *                              stamp found is chosen.
 * @return string|null Snapshot URL using the id_ variant, or null when the
 *                     listing could not be fetched or holds no stamps.
 */
function pick_wayback_snapshot($absoluteUrl, $ts)
{
    // Scrape listing; choose closest timestamp; prefer id_ variant to hide toolbar.
    $index = "https://web.archive.org/web/*/" . rawurlencode($absoluteUrl);
    list($code, $html,) = http_get($index, 25);
    if ($code != 200 || !$html) {
        return null;
    }
    if (!preg_match_all('#/web/(\d{14})/https?://#i', $html, $m)) {
        return null;
    }

    $best = null;
    $bestdiff = PHP_INT_MAX;
    $utc = new DateTimeZone('UTC');
    foreach (array_unique($m[1]) as $stamp) {
        $dt = DateTime::createFromFormat('YmdHis', $stamp, $utc);
        if (!$dt) {
            continue;
        }
        $t = (int)$dt->getTimestamp();
        // Without a target time, a larger $t gives a smaller "distance",
        // so the newest snapshot wins.
        $diff = $ts ? abs($t - $ts) : (PHP_INT_MAX - $t);
        if ($diff < $bestdiff) {
            $bestdiff = $diff;
            $best = $stamp;
        }
    }
    if (!$best) {
        return null;
    }

    // Use id_ to suppress toolbar (still strip as a fallback)
    return "https://web.archive.org/web/{$best}id_/" . $absoluteUrl;
}

/**
 * Removes Wayback Machine additions from a snapshot and rewrites archived
 * /articles/ URLs back to local paths.
 *
 * @param string $html Snapshot HTML.
 * @return string Cleaned HTML.
 */
function strip_wayback_and_rewrite($html)
{
    // Remove Wayback toolbar chunk (documented between comment markers)
    // and their injected scripts; then unwrap archived /articles paths back to local.
    // NOTE(review): the two patterns below were empty no-ops in the copy of the
    // file under review (their markup was lost); they are reconstructed from the
    // comment above. Confirm against the deployed script before relying on them.
    $html = preg_replace(
        '#<!--\s*BEGIN WAYBACK TOOLBAR INSERT\s*-->.*?<!--\s*END WAYBACK TOOLBAR INSERT\s*-->#is',
        '',
        $html
    );
    // Only scripts served from archive.org's own asset paths; archived copies of
    // the site's scripts live under /web/<stamp>/ and are left untouched.
    $html = preg_replace(
        '#<script\b[^>]*\bsrc=["\'](?:https?:)?//(?:[a-z0-9-]+\.)?archive\.org/(?:_static|includes)/[^"\']*["\'][^>]*>\s*</script>#is',
        '',
        $html
    );

    // Rewrite archived .../articles/* URLs back to local /articles/*
    $html = preg_replace('#https?://web\.archive\.org/web/\d+(?:id_)?/https?://(?:action\.news|www\.wnactionnews\.com)/articles/#i', '/articles/', $html);
    // Some snapshots use start.westnet.ca; keep local path if it is under /articles/
    $html = preg_replace('#https?://web\.archive\.org/web/\d+(?:id_)?/https?://start\.westnet\.ca/articles/#i', '/articles/', $html);

    return $html;
}

/**
 * Collects local /articles/ media URLs referenced by the HTML.
 *
 * @param string $html Article HTML (already rewritten to local paths).
 * @return array Unique URLs from src/poster attributes (jpg, jpeg, png, gif,
 *               webp, mp4) and srcset candidates (same list without mp4).
 */
function find_media($html)
{
    $found = array();

    // src / poster
    if (preg_match_all('#\b(?:src|poster)=["\']([^"\']+)["\']#i', $html, $m)) {
        foreach ($m[1] as $u) {
            if (preg_match('#^/articles/.+\.(jpe?g|png|gif|webp|mp4)$#i', $u)) {
                $found[$u] = 1;
            }
        }
    }

    // srcset: comma-separated "url descriptor" candidates
    if (preg_match_all('#\bsrcset=["\']([^"\']+)["\']#i', $html, $m)) {
        foreach ($m[1] as $set) {
            foreach (preg_split('/\s*,\s*/', $set) as $chunk) {
                $parts = preg_split('/\s+/', $chunk, 2);
                $u = trim($parts[0]);
                if (preg_match('#^/articles/.+\.(jpe?g|png|gif|webp)$#i', $u)) {
                    $found[$u] = 1;
                }
            }
        }
    }

    return array_keys($found);
}

/**
 * Downloads a URL and, when $save is true, writes it to $destPath.
 *
 * @param string $srcUrl   URL to download.
 * @param string $destPath Destination file path.
 * @param bool   $save     False for a dry run (download only).
 * @return array array($ok, $bytes, $error): $bytes is the body length on
 *               success; $error is null on success, otherwise the transport
 *               error, "HTTP <code>", or a local write failure message.
 */
function fetch_and_optionally_save($srcUrl, $destPath, $save)
{
    list($code, $body, $err) = http_get($srcUrl, 25);

    // curl_exec() can return false even after a 200 status (e.g. a timeout
    // mid-transfer), so require an actual non-empty string.
    if ($code != 200 || !is_string($body) || $body === '') {
        return array(false, 0, $err ? $err : "HTTP $code");
    }

    if ($save) {
        $dir = dirname($destPath);
        if (!ensure_dir($dir)) {
            return array(false, 0, "cannot create directory $dir");
        }
        if (@file_put_contents($destPath, $body) === false) {
            return array(false, 0, "cannot write $destPath");
        }
    }

    return array(true, strlen($body), null);
}

/**
 * Builds the output file name for an article.
 *
 * @param string   $slug  Article slug.
 * @param int|null $pubTs Publication Unix time, if known.
 * @param string   $mode  'slug', 'date' or 'datedslug'; the dated modes fall
 *                        back to 'slug' when no date is available.
 * @param string   $ext   File extension without the dot.
 * @return string File name such as "slug.htm" or "2020-01-31_slug.htm".
 */
function build_target_name($slug, $pubTs, $mode, $ext)
{
    $date = $pubTs ? gmdate('Y-m-d', $pubTs) : null;

    if ($date) {
        if ($mode === 'date') {
            return $date . '.' . $ext;
        }
        if ($mode === 'datedslug') {
            return $date . '_' . $slug . '.' . $ext;
        }
    }

    return $slug . '.' . $ext; // slug
}

/*** LOAD FEED ***/
$feedSrc = null;
if (file_exists($RSS_FILE)) {
    $feedSrc = @file_get_contents($RSS_FILE);
    if ($feedSrc) {
        vv("Loaded local RSS: $RSS_FILE (".strlen($feedSrc)." bytes)");
    }
}
if (!$feedSrc) {
    // fallback to remote, in case local file missing/corrupted
    list($code, $body, $err) = http_get('https://action.news/rss.xml', 25);
    if ($code == 200 && $body) {
        $feedSrc = $body;
        vv("Loaded remote RSS (https://action.news/rss.xml)");
    }
}
$items = parse_feed_items($feedSrc);

/*** EXIT EARLY IF FEED EMPTY ***/
if (!$items) {
    out("ERROR: Could not parse RSS/Atom items from local or remote feed.");
    out("Tip: ensure $RSS_FILE is valid XML or that https://action.news/rss.xml is reachable.");
    exit(1);
}

/*** PROCESS ***/
$start = microtime(true);
out("WestNet Restorer (DRY-RUN=" . ($SAVE?'NO (saving ON)':'YES') . ") PHP " . PHP_VERSION);
out("Filters: only_missing=".($ONLY_MISSING?'1':'0')." media=".($MEDIA?'1':'0')." force=".($FORCE?'1':'0')." name=$NAME_MODE ext=$EXT");
out("Range: from=".($FROM?gmdate('c',$FROM):'-')." to=".($TO?gmdate('c',$TO):'-')." limit=".$LIMIT);
out(str_repeat('-', 78));

$done = 0;
$written = 0;
$skipped = 0;
$failed = 0;
$mediaCount = 0;

foreach ($items as $i => $it) {
    if ($done >= $LIMIT) break;

    $title = trim($it['title']);
    $link  = trim($it['link']);
    $dateS = trim($it['date']);
    $pubTs = $dateS ? strtotime($dateS) : null;

    // Items without a parseable date are never excluded by the range filter.
    if ($FROM && $pubTs && $pubTs < $FROM) continue;
    if ($TO && $pubTs && $pubTs > $TO) continue;

    $slug = parse_slug_from_link($link);
    if (!$slug) {
        // fallback: derive from title
        $slug = preg_replace('/[^\w\-]+/u', '_', mb_strtolower(trim($title), 'UTF-8'));
        $slug = trim($slug, '_');
        if (!$slug) $slug = 'untitled_' . ($i+1);
    }

    $targetName = build_target_name($slug, $pubTs, $NAME_MODE, $EXT);
    $targetPath = $ARTICLES_DIR . '/' .
$targetName; $exists = (file_exists($ARTICLES_DIR.'/'.$slug.'.htm') || file_exists($ARTICLES_DIR.'/'.$slug.'.txt') || file_exists($targetPath)); if ($ONLY_MISSING && $exists && !$FORCE) { $skipped++; vv("[skip-exists] $slug"); continue; } $done++; out("#{$done} {$title}"); out(" slug: {$slug}"); out(" date: " . ($pubTs?gmdate('Y-m-d H:i:s \U\T\C', $pubTs):'n/a')); out(" link: {$link}"); out(" out : /articles/{$targetName} (exists? ".($exists?'YES':'NO').")"); // Canonical URL to restore $canonical = "https://action.news/westnetnews.php?article=" . rawurlencode($slug); $wb = pick_wayback_snapshot($canonical, $pubTs); if (!$wb) { $failed++; out(" -> FAIL: no Wayback snapshot"); out(''); continue; } out(" wb : {$wb}"); list($code, $html, $err) = http_get($wb, 25); if ($code!=200 || !$html) { $failed++; out(" -> FAIL: fetch error: ".($err?:("HTTP ".$code))); out(''); continue; } // Extract the meaningful body: prefer your story container if present, else entire $html = @mb_convert_encoding($html, 'UTF-8', 'UTF-8, ISO-8859-1, Windows-1252'); $clean = strip_wayback_and_rewrite($html); // take inner body if possible $inner = $clean; if (preg_match('#]+id=["\']story_content["\'][^>]*>(.*?)

#is', $clean, $m)) { $inner = $m[1]; } elseif (preg_match('#]+class=["\']article-content["\'][^>]*>(.*?)

Related Articles

Share this page