defined( 'WPINC' ) || exit();
/**
 * Handles sitemap crawling, blacklisting, and async operations.
 */
class Crawler extends Root {
const LOG_TAG = 'πΈοΈ';
const TYPE_REFRESH_MAP = 'refresh_map';
const TYPE_EMPTY = 'empty';
const TYPE_BLACKLIST_EMPTY = 'blacklist_empty';
const TYPE_BLACKLIST_DEL = 'blacklist_del';
const TYPE_BLACKLIST_ADD = 'blacklist_add';
const TYPE_START = 'start';
const TYPE_RESET = 'reset';
const USER_AGENT = 'lscache_walker';
const FAST_USER_AGENT = 'lscache_runner';
const STATUS_BLACKLIST = 'B';
const STATUS_NOCACHE = 'N';
private $_sitemeta = 'meta.data';
// Reason that ended current run.
// Server IP set in settings.
private $_crawler_conf = [
// Built crawler variants.
// Current allowed worker threads.
private $_cur_threads = -1;
// Max timestamp to run until.
// Last time threads were adjusted.
private $_cur_thread_time;
// Map-status list to batch-save.
private $_map_status_list = [
/**
 * Initialize crawler, assign sitemap path.
 */
public function __construct() {
$this->_sitemeta = 'meta' . get_current_blog_id() . '.data';
$this->_resetfile = LITESPEED_STATIC_DIR . '/crawler/' . $this->_sitemeta . '.reset';
$this->_summary = self::get_summary();
$this->_ncpu = $this->_get_server_cpu();
$this->_server_ip = $this->conf( Base::O_SERVER_IP );
self::debug( 'Init w/ CPU cores=' . $this->_ncpu );
/**
 * Detect the number of server CPU cores.
 *
 * @return int Number of cores detected.
 */
private function _get_server_cpu() {
    // Fallback used whenever the core count cannot be detected safely.
    $cpu_count = 1;

    $cpuinfo_file = '/proc/cpuinfo';

    // open_basedir restrictions typically forbid reading /proc at all,
    // so bail out early instead of triggering a warning.
    $setting_open_dir = ini_get( 'open_basedir' );
    if ( $setting_open_dir ) {
        return $cpu_count; // Server has limit.
    }

    try {
        // phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
        if ( ! @is_file( $cpuinfo_file ) ) {
            return $cpu_count;
        }

        // Local system read; no WP alternative. Suppress sniff.
        // phpcs:ignore WordPress.WP.AlternativeFunctions.file_get_contents_file_get_contents
        $cpuinfo = file_get_contents( $cpuinfo_file );
        if ( false === $cpuinfo ) {
            // File exists but could not be read — keep the safe fallback.
            return $cpu_count;
        }

        // Each logical core contributes one line starting with "processor".
        preg_match_all( '/^processor/m', $cpuinfo, $matches );
        $cnt = isset( $matches[0] ) ? count( $matches[0] ) : 0;
        if ( $cnt > 0 ) {
            $cpu_count = $cnt;
        }
    } catch ( \Exception $e ) {
        // Best-effort detection only — swallow and fall through to the default.
    }

    return $cpu_count;
}
/**
 * Check whether the current crawler is active.
 *
 * @param int $curr Crawler index.
 * @return bool Active state.
 */
public function is_active( $curr ) {
    // A crawler counts as active unless its index is on the stored bypass list.
    $disabled_ids = self::get_option( 'bypass_list', [] );
    $is_bypassed  = in_array( (int) $curr, $disabled_ids, true );

    return ! $is_bypassed;
}
/**
 * Toggle the current crawler's active state and return the updated state.
 *
 * @param int $curr Crawler index.
 * @return bool True if turned on, false if turned off.
 */
public function toggle_activeness( $curr ) {
    $bypass_list = self::get_option( 'bypass_list', [] );
    $curr_id     = (int) $curr;

    if ( in_array( $curr_id, $bypass_list, true ) ) {
        // Currently bypassed -> remove it from the list, turning the crawler ON.
        $key = array_search( $curr_id, $bypass_list, true );
        unset( $bypass_list[ $key ] );
        // Reindex so the stored option stays a plain sequential list.
        $bypass_list = array_values( $bypass_list );
        self::update_option( 'bypass_list', $bypass_list );
        return true;
    }

    // Currently active -> append to the bypass list, turning the crawler OFF.
    $bypass_list[] = $curr_id;
    self::update_option( 'bypass_list', $bypass_list );
    return false;
}
public function clear_disabled_list() {
    // Wipe the bypass list so every configured crawler is allowed to run again.
    self::update_option( 'bypass_list', [] );

    // Surface a one-time admin notice confirming the reset.
    Admin_Display::note( __( 'Crawler disabled list is cleared! All crawlers are set to active! ', 'litespeed-cache' ) );

    self::debug( 'All crawlers are set to active...... ' );
}
/**
 * Overwrite get_summary to init elements.
 *
 * @param string|false $field Field name to fetch or false to get all.
 * @return mixed Summary value/array or null if not found.
 */
public static function get_summary( $field = false ) {
// NOTE(review): the `$_default = array(` opener and several default keys
// (used by array_merge() below) are not visible in this chunk — confirm
// against the full file before editing.
'curr_crawler_beginning_time' => 0,
'this_full_beginning_time' => 0,
'last_full_time_cost' => 0,
'last_crawler_total_cost' => 0,
'crawler_stats' => [], // this will store all crawlers hit/miss crawl status.
// Drop the cached alloptions row so the freshest stored summary is read below.
wp_cache_delete( 'alloptions', 'options' ); // ensure the summary is current.
$summary = parent::get_summary();
// Merge so stored values win but every default key is guaranteed to exist.
$summary = array_merge( $_default, $summary );
if ( false === $field ) {
// NOTE(review): a `return $summary;` plus closing brace is expected here but
// is not visible in this chunk.
if ( array_key_exists( $field, $summary ) ) {
return $summary[ $field ];
// NOTE(review): function tail (fallthrough return for unknown fields, per the
// docblock "null if not found") is not visible in this chunk.
/**
 * Overwrite save_summary.
 *
 * @param array|false $data Data to save or false to save current.
 * @param bool $reload Whether to reload after saving.
 * @param bool $overwrite Whether to overwrite completely.
 */
public static function save_summary( $data = false, $reload = false, $overwrite = false ) {
// NOTE(review): `$instance` is used below but its initialization (presumably
// `$instance = self::cls();`) is not visible in this chunk — confirm.
$instance->_summary['meta_save_time'] = time();
// NOTE(review): this assignment likely sat behind an `if ( ! $data )` guard in
// the original (so explicit $data is not clobbered) — guard not visible here.
$data = $instance->_summary;
parent::save_summary( $data, $reload, $overwrite );
// Also persist a JSON copy of the summary under the static crawler directory.
File::save( LITESPEED_STATIC_DIR . '/crawler/' . $instance->_sitemeta, wp_json_encode( $data ), true );
/**
 * Cron start async crawling.
 */
public static function start_async_cron() {
    // Fire the crawler hook asynchronously; used by scheduled (cron) runs.
    Task::async_call( 'crawler' );
}
/**
 * Manually start async crawling.
 */
public static function start_async() {
    // Force-run variant for when an admin triggers crawling by hand.
    Task::async_call( 'crawler_force' );

    // Confirm to the admin that the async job was dispatched.
    Admin_Display::success( __( 'Started async crawling', 'litespeed-cache' ) );
}
/**
 * Async crawl handler.
 *
 * @param bool $manually_run Whether manually triggered.
 */
public static function async_handler( $manually_run = false ) {
    self::debug( '------------async-------------start_async_handler' );
    // Hand off straight to the synchronous entry point.
    self::start( (bool) $manually_run );
}
/**
 * Start crawling.
 *
 * @param bool $manually_run Whether manually triggered.
 */
public static function start( $manually_run = false ) {
    // Respect the server-level kill switch before doing any work.
    if ( ! Router::can_crawl() ) {
        self::debug( '......crawler is NOT allowed by the server admin......' );
        return;
    }

    // Only log the manual-trigger marker when this was actually a manual run.
    if ( $manually_run ) {
        self::debug( '......crawler manually ran......' );
    }

    self::cls()->_crawl_data( (bool) $manually_run );
}
/**
 * Crawl data.
 *
 * @param bool $manually_run Whether manually triggered.
 */
private function _crawl_data( $manually_run ) {
// Tag this run with a short random lane hash so overlapping runs can detect each other.
if ( ! defined( 'LITESPEED_LANE_HASH' ) ) {
define( 'LITESPEED_LANE_HASH', Str::rrand( 8 ) );
// NOTE(review): closing brace for the define() guard is not visible in this chunk.
if ( $this->_check_valid_lane() ) {
$this->_take_over_lane();
// NOTE(review): an `else` branch that logs and bails when the lane is busy is
// implied by the next line but not visible here — confirm.
self::debug( 'β οΈ lane in use' );
// NOTE(review): the debug string above appears mojibake-garbled (likely a UTF-8
// warning emoji originally); treat as a runtime string, do not "fix" casually.
self::debug( '......crawler started......' );
// for the first time running.
// Rebuild the URL map when the summary or the crawler DB tables are missing.
if ( ! $this->_summary || ! Data::cls()->tb_exist( 'crawler' ) || ! Data::cls()->tb_exist( 'crawler_blacklist' ) ) {
$this->cls( 'Crawler_Map' )->gen();
// if finished last time, regenerate sitemap.
if ( 'touchedEnd' === $this->_summary['done'] ) {
// check whole crawling interval.
$last_finished_at = (int) $this->_summary['last_full_time_cost'] + (int) $this->_summary['this_full_beginning_time'];
// Cron runs honor the configured re-crawl interval; manual runs always proceed.
if ( ! $manually_run && ( time() - $last_finished_at ) < $this->conf( Base::O_CRAWLER_CRAWL_INTERVAL ) ) {
self::debug( 'Cron abort: cache warmed already.' );
// NOTE(review): a `return;` is expected after the abort log but is not visible here.
self::debug( 'TouchedEnd. regenerate sitemap....' );
$this->cls( 'Crawler_Map' )->gen();
$crawlers = $this->list_crawlers();
$crawlers_count = count( $crawlers );
// Skip the crawlers that in bypassed list.
while ( ! $this->is_active( $this->_summary['curr_crawler'] ) && $this->_summary['curr_crawler'] < $crawlers_count ) {
self::debug( 'Skipped the Crawler #' . $this->_summary['curr_crawler'] . ' ......' );
$this->_summary['curr_crawler'] = (int) $this->_summary['curr_crawler'] + 1;
// All crawlers consumed -> mark the whole cycle done and stop this run.
if ( $this->_summary['curr_crawler'] >= $crawlers_count ) {
$this->_end_reason = 'end';
$this->_terminate_running();
// In case crawlers are all done but not reload, reload it.
if ( empty( $this->_summary['curr_crawler'] ) || empty( $this->_crawlers[ $this->_summary['curr_crawler'] ] ) ) {
$this->_summary['curr_crawler'] = 0;
// Reset this crawler's hit/miss stats before it starts a fresh pass.
$this->_summary['crawler_stats'][ $this->_summary['curr_crawler'] ] = [];
$res = $this->load_conf();
// NOTE(review): the failure check on $res (e.g. `if ( ! $res )`) is not visible here.
self::debug( 'Load conf failed' );
$this->_terminate_running();
// NOTE(review): the matching `try {` for this catch is not visible in this chunk.
} catch ( \Exception $e ) {
self::debug( 'π ' . $e->getMessage() );
/**
 * Load conf before running crawler.
 *
 * @return bool True on success.
 */
private function load_conf() {
// Base URL every crawl request is built from.
$this->_crawler_conf['base'] = site_url();
$current_crawler = $this->_crawlers[ $this->_summary['curr_crawler'] ];
// Copy per-variant settings; keys prefixed "cookie:" become request cookies.
foreach ( $current_crawler as $k => $v ) {
// NOTE(review): this condition looks inverted — substr( $k, 7 ) strips a
// 'cookie:' prefix, so the assignment should run when $k DOES start with
// 'cookie:' (i.e. `0 === strpos(...)`). Alternatively the original used this
// as a guard with a `continue;` that is not visible in this chunk — confirm.
if ( 0 !== strpos( $k, 'cookie:' ) ) {
$this->_crawler_conf['cookies'][ substr( $k, 7 ) ] = $v;
// WebP/AVIF variants advertise next-gen image support via the Accept header.
if ( ! empty( $current_crawler['webp'] ) ) {
$this->_crawler_conf['headers'][] = 'Accept: image/' . ( 2 === (int) $this->conf( Base::O_IMG_OPTM_WEBP ) ? 'avif' : 'webp' ) . ',*/*';
if ( ! empty( $current_crawler['mobile'] ) ) {
$this->_crawler_conf['ua'] = 'Mobile iPhone';
// Limit delay to use server setting.
$this->_crawler_conf['run_delay'] = 500; // microseconds.
// The PHP constant may raise (never lower) the delay between requests.
if ( defined( 'LITESPEED_CRAWLER_USLEEP' ) && constant( 'LITESPEED_CRAWLER_USLEEP' ) > $this->_crawler_conf['run_delay'] ) {
$this->_crawler_conf['run_delay'] = (int) constant( 'LITESPEED_CRAWLER_USLEEP' );
// A server env var may raise the delay further still.
if ( isset( $_SERVER[ Base::ENV_CRAWLER_USLEEP ] ) ) {
$env_usleep = absint( wp_unslash( $_SERVER[ Base::ENV_CRAWLER_USLEEP ] ) );
if ( $env_usleep > (int) $this->_crawler_conf['run_delay'] ) {
$this->_crawler_conf['run_delay'] = $env_usleep;
$this->_crawler_conf['run_duration'] = $this->get_crawler_duration();
$this->_crawler_conf['load_limit'] = (int) $this->conf( Base::O_CRAWLER_LOAD_LIMIT );
// ENFORCE env var overrides the setting outright; the plain env var may only lower it.
if ( isset( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE ] ) ) {
$this->_crawler_conf['load_limit'] = absint( wp_unslash( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT_ENFORCE ] ) );
} elseif ( isset( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT ] ) ) {
$env_limit = absint( wp_unslash( $_SERVER[ Base::ENV_CRAWLER_LOAD_LIMIT ] ) );
if ( $env_limit < (int) $this->_crawler_conf['load_limit'] ) {
$this->_crawler_conf['load_limit'] = $env_limit;
// A zero load limit means crawling is effectively disabled for this server.
if ( 0 === (int) $this->_crawler_conf['load_limit'] ) {
self::debug( 'π Terminated crawler due to load limit set to 0' );
// NOTE(review): this function continues past the end of this chunk (the
// terminate call and the `return true;` implied by the docblock are not visible).