Synced
fix(google): Use the updated Google search endpoint, based on unixfox's research for SearXNG (b531406ce3/engines/text/google.php). Add initial Bing support; note that Bing obfuscates anchor links, but the encoding was trivial to determine (f9f51c6b74). Update bing.php to decode the URL: added urldecode to the base64url-decoded URL to enable proper parsing by get_base_url (97c085cf95). Ignore Bing's relative links: updated the parser for Bing links to ignore links that don't fit "results", such as relative links; only non-obfuscated links and de-obfuscated absolute links will correctly make it through the parser (7f12ad2950). https://github.com/Ahwxorg/LibreY/pull/218 https://github.com/Ahwxorg/LibreY/pull/215
This commit is contained in:
		
					parent
					
						
							
								7be73c5967
							
						
					
				
			
			
				commit
				
					
						747e827fef
					
				
			
		
					 3 changed files with 125 additions and 7 deletions
				
			
		
							
								
								
									
										92
									
								
								engines/text/bing.php
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										92
									
								
								engines/text/bing.php
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,92 @@
 | 
			
		|||
<?php
 | 
			
		||||
    /**
     * Text search engine request that scrapes Bing's HTML results page.
     *
     * Result anchors on Bing point at a redirect whose `u` query parameter
     * carries the real target, either verbatim or as an "a1"-prefixed,
     * URL-safe base64 string. This class builds the search URL and turns the
     * returned markup into the result arrays used by the other text engines
     * (title, url, base_url, description).
     */
    class BingSearchRequest extends EngineRequest {
        /**
         * Builds the Bing search URL for the current query, page and language.
         *
         * @return string the URL to request
         */
        public function get_request_url() {
            // URL-encode the query, then turn %22 back into a literal double quote.
            $query_encoded = str_replace("%22", "\"", urlencode($this->query));

            $results_language = $this->opts->language;

            // TODO figure out how to not autocorrect
            // "first" is the 1-based index of the first result on the requested page.
            $url = "https://www.bing.com/search?q=$query_encoded&first=" . ((10 * $this->page) + 1);

            // TODO language setting
            if (!is_null($results_language))
                $url .= "&lang=$results_language";

            return $url;
        }

        /**
         * Extracts the search results from a Bing results page.
         *
         * @param mixed $response raw response handed to get_xpath()
         * @return array list of arrays with the keys "title", "url",
         *               "base_url" and "description" (all HTML-escaped);
         *               empty when the response could not be parsed
         */
        public function parse_results($response) {
            $results = array();
            $xpath = get_xpath($response);

            if (!$xpath)
                return $results;

            foreach ($xpath->query("//ol[@id='b_results']//li") as $result) {
                $href = $xpath->evaluate(".//h2//a//@href", $result)[0] ?? null;

                if ($href == null)
                    continue;

                $url = $this->resolve_result_url($href->textContent);

                if ($url === null)
                    continue;

                $escaped_url = htmlspecialchars($url);

                // Skip an entry that repeats the URL of the result stored just before it.
                // end() yields false on an empty list, hence the is_array() guard.
                $previous = end($results);
                if (is_array($previous) && ($previous["url"] ?? null) === $escaped_url)
                    continue;

                $title = $xpath->evaluate(".//h2//a", $result)[0] ?? null;

                if ($title == null)
                    continue;

                $title = $title->textContent;

                $description = ($xpath->evaluate(".//div[contains(@class, 'b_caption')]//p", $result)[0] ?? null)?->textContent ?? '';

                $results[] = array(
                    "title" => htmlspecialchars($title),
                    "url" => $escaped_url,
                    // base_url is to be removed in the future, see #47
                    "base_url" => htmlspecialchars(get_base_url($url)),
                    "description" => $description === '' ?
                                     TEXTS["result_no_description"] :
                                     htmlspecialchars($description)
                );
            }

            return $results;
        }

        /**
         * Recovers the target URL from the href of a Bing result anchor.
         *
         * @param string $href href attribute value of the result anchor
         * @return string|null the target URL, or null when the href carries no
         *                     usable `u` parameter and the result must be skipped
         */
        private function resolve_result_url($href) {
            $query = parse_url($href, PHP_URL_QUERY);

            // parse_url() gives null/false (or "") when there is no usable query string.
            if ($query == false)
                return null;

            parse_str($query, $params);

            // "u[]=..." would make this an array; only a plain string can be a URL.
            if (!array_key_exists('u', $params) || !is_string($params['u']))
                return null;

            $target = $params['u'];

            // First two characters are irrelevant, strip them before decoding.
            if (str_starts_with($target, "a1aHR0c"))
                $target = substr($target, 2);

            if (str_starts_with($target, "aHR0c")) {
                // URL-safe base64 ("aHR0c" is the encoding of "http"): map the
                // alphabet back to the standard one, then decode strictly.
                $decoded = base64_decode(strtr($target, '-_', '+/'), true);

                if ($decoded === false)
                    return null;

                $target = urldecode($decoded);
            }

            // It's probably a Bing-relative link such as for video, skip it.
            if ($target === '' || str_starts_with($target, "a1"))
                return null;

            return $target;
        }
    }
 | 
			
		||||
?>
 | 
			
		||||
| 
						 | 
				
			
			@ -1,6 +1,24 @@
 | 
			
		|||
<?php
 | 
			
		||||
    class GoogleRequest extends EngineRequest {
 | 
			
		||||
        protected string $arc_id;
 | 
			
		||||
        protected int $arc_timestamp = 0;
 | 
			
		||||
 | 
			
		||||
        /**
         * Generates a fresh random ARC id and records when it was created.
         *
         * The id has the form "srp_" + 24 random characters + "_1" and is
         * stored in $this->arc_id; $this->arc_timestamp is set to the current
         * Unix time.
         */
        private function generate_arc_id() {
            // Digits, lowercase, uppercase, "_" and "-": every symbol appears
            // exactly once so that each one is equally likely to be drawn.
            $charset = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_-";
            $last_index = strlen($charset) - 1;

            $arc_id = "srp_";

            for ($i = 0; $i < 24; $i++)
                $arc_id .= $charset[random_int(0, $last_index)];

            $this->arc_id = $arc_id . "_1";
            $this->arc_timestamp = time();
        }
 | 
			
		||||
 | 
			
		||||
        public function get_request_url() {
 | 
			
		||||
            if ($this->arc_timestamp + 3600 < time())
 | 
			
		||||
                $this->generate_arc_id();
 | 
			
		||||
 | 
			
		||||
            $query_encoded = str_replace("%22", "\"", urlencode($this->query));
 | 
			
		||||
            $results = array();
 | 
			
		||||
| 
						 | 
				
			
			@ -8,6 +26,7 @@
 | 
			
		|||
            $domain = $this->opts->google_domain;
 | 
			
		||||
            $results_language = $this->opts->language;
 | 
			
		||||
            $number_of_results = $this->opts->number_of_results;
 | 
			
		||||
            $arc_page = sprintf("%02d", $this->page * 10);
 | 
			
		||||
 | 
			
		||||
            $url = "https://www.google.$domain/search?q=$query_encoded&nfpr=1&start=$this->page";
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -22,6 +41,8 @@
 | 
			
		|||
            if (isset($_COOKIE["safe_search"]))
 | 
			
		||||
                $url .= "&safe=medium";
 | 
			
		||||
 | 
			
		||||
            $url .= "&asearch=arc&async=arc_id:$this->arc_id$arc_page,use_ac:true,_fmt:html";
 | 
			
		||||
 | 
			
		||||
            return $url;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -33,21 +54,21 @@
 | 
			
		|||
            if (!$xpath)
 | 
			
		||||
                return $results;
 | 
			
		||||
 | 
			
		||||
            $didyoumean = $xpath->query(".//a[@class='gL9Hy']")[0];
 | 
			
		||||
            $didyoumean = $xpath->query(".//p[@class='QRYxYe NNMgCf']/a/b/i")[0];
 | 
			
		||||
 | 
			
		||||
            if (!is_null($didyoumean))
 | 
			
		||||
                array_push($results, array(
 | 
			
		||||
                    "did_you_mean" => $didyoumean->textContent
 | 
			
		||||
                ));
 | 
			
		||||
 | 
			
		||||
            foreach($xpath->query("//div[@id='search']//div[contains(@class, 'g')]") as $result) {
 | 
			
		||||
                $url = $xpath->evaluate(".//div[@class='yuRUbf']//a/@href", $result)[0];
 | 
			
		||||
            foreach($xpath->query("//div[@class='MjjYud']") as $result) {
 | 
			
		||||
                $url = $xpath->evaluate(".//a[@class='zReHs']/@href", $result)[0];
 | 
			
		||||
 | 
			
		||||
                if ($url == null)
 | 
			
		||||
                    continue;
 | 
			
		||||
 | 
			
		||||
                if (!empty($results) && array_key_exists("url", end($results)) && end($results)["url"] == $url->textContent)
 | 
			
		||||
                        continue;
 | 
			
		||||
                    continue;
 | 
			
		||||
 | 
			
		||||
                $url = $url->textContent;
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -76,4 +97,4 @@
 | 
			
		|||
            return $results;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
?>
 | 
			
		||||
?>
 | 
			
		||||
| 
						 | 
				
			
			@ -1,6 +1,6 @@
 | 
			
		|||
<?php
 | 
			
		||||
    /**
     * Lists the identifiers of the available text search engines.
     *
     * @return array engine identifier strings, including "bing"
     */
    function get_engines() {
        return array("google", "duckduckgo", "brave", "yandex", "ecosia", "mojeek", "bing");
    }
 | 
			
		||||
 | 
			
		||||
    class TextSearch extends EngineRequest {
 | 
			
		||||
| 
						 | 
				
			
			@ -88,6 +88,11 @@
 | 
			
		|||
                return new MojeekSearchRequest($opts, $mh);
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            if ($engine == "bing") {
 | 
			
		||||
                require_once "engines/text/bing.php";
 | 
			
		||||
                return new BingSearchRequest($opts, $mh);
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // if an invalid engine is selected, don't give any results
 | 
			
		||||
            return null;
 | 
			
		||||
        }
 | 
			
		||||
| 
						 | 
				
			
			@ -217,4 +222,4 @@
 | 
			
		|||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
?>
 | 
			
		||||
?>
 | 
			
		||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue