fa4aa9a0fd

bf6319839e

8198287ec0

61deefb75b
This commit is contained in:
Bakhai Support 2025-09-29 16:59:10 +05:30
commit af4bfbd0cd

View file

@ -355,6 +355,36 @@ class ddg{
/**
 * Entry point for web searches: picks which scraper handles the request.
 *
 * Next-page requests carry a token ($get["npt"]) whose stored payload is
 * "<prefix>,<data>". Prefix "0" marks a page produced by web_html() (data is
 * the JSON-encoded form fields); any other prefix is handed to web_full()
 * (data is the d.js path stored with prefix "1").
 *
 * First-page requests are routed on the search string: queries containing a
 * double quote or a colon (e.g. a possible "site:" operator) go to
 * web_html(), everything else goes to web_full().
 *
 * @param array $get Request parameters; reads "npt" and "s" here and passes
 *                   the whole array through to the selected scraper.
 * @return array Result structure built by web_html() or web_full().
 * @throws Exception When the stored next-page token has no "<prefix>," part,
 *                   plus whatever the backend or the scrapers throw.
 */
public function web($get){
	
	if(!empty($get["npt"])){
		
		[$stored, $proxy] = $this->backend->get($get["npt"], "web");
		
		// limit 2: only the first comma separates prefix from payload,
		// the payload itself (JSON or URL) may contain more commas
		$parts = explode(",", $stored, 2);
		
		if(count($parts) !== 2){
			
			// without this guard $parts[1] would be an undefined offset and
			// null would be forwarded to the scraper
			throw new Exception("Malformed next page token");
		}
		
		[$prefix, $payload] = $parts;
		
		// strict comparison: loose == would also match "00", "0.0", " 0"
		if($prefix === "0"){
			
			return $this->web_html($get, [$payload, $proxy]);
		}
		
		return $this->web_full($get, [$payload, $proxy]);
	}
	
	// first page: we have $get["s"]
	$search = $get["s"];
	
	if(
		strpos($search, "\"") !== false || // contains quotes
		strpos($search, ":") !== false // contains a potential operator such as site:
	){
		
		return $this->web_html($get);
	}
	
	// no quotes or operators sent, do full web search
	return $this->web_full($get);
}
public function web_html($get, $npt = null){
$out = [
"status" => "ok",
"spelling" => [
@ -371,9 +401,368 @@ class ddg{
"related" => []
];
if($get["npt"]){
if($npt !== null){
[$js_link, $proxy] = $this->backend->get($get["npt"], "web");
[$get_filters, $proxy] = $npt;
$get_filters = json_decode($get_filters, true);
}else{
if(strlen($get["s"]) === 0){
throw new Exception("Search term is empty!");
}
$proxy = $this->backend->get_ip();
// generate filters
$get_filters = [
"q" => $get["s"]
];
if($get["country"] == "any"){
$get_filters["kl"] = "wt-wt";
}else{
$get_filters["kl"] = $get["country"];
}
switch($get["nsfw"]){
case "yes": $get_filters["kp"] = "-2"; break;
case "maybe": $get_filters["kp"] = "-1"; break;
case "no": $get_filters["kp"] = "1"; break;
}
$df = true;
if($get["newer"] === false){
if($get["older"] !== false){
$start = 36000;
$end = $get["older"];
}else{
$df = false;
}
}else{
$start = $get["newer"];
if($get["older"] !== false){
$end = $get["older"];
}else{
$end = time();
}
}
if($df === true){
$get_filters["df"] = date("Y-m-d", $start) . ".." . date("Y-m-d", $end);
}
}
//
// Get HTML
//
try{
$html = $this->get(
$proxy,
"https://html.duckduckgo.com/html/",
$get_filters
);
}catch(Exception $e){
throw new Exception("Failed to fetch search page");
}
//$html = file_get_contents("scraper/ddg.html");
$this->fuckhtml->load($html);
//
// Get next page token
//
$forms =
$this->fuckhtml
->getElementsByTagName(
"form"
);
foreach(array_reverse($forms) as $form){
$this->fuckhtml->load($form);
$input_probe =
$this->fuckhtml
->getElementsByClassName(
"btn--alt",
"input"
);
if(count($input_probe) !== 0){
// found next page!
$inputs =
$this->fuckhtml
->getElementsByAttributeValue(
"type",
"hidden",
"input"
);
$query = [];
foreach($inputs as $q){
$query[
$this->fuckhtml
->getTextContent(
$q["attributes"]["name"]
)
] =
$this->fuckhtml
->getTextContent(
$q["attributes"]["value"]
);
}
$out["npt"] =
$this->backend->store(
"0," . json_encode($query),
"web",
$proxy
);
break;
}
}
// reset
$this->fuckhtml->load($html);
//
// parse wikipedia answer
//
$wiki_wrapper =
$this->fuckhtml
->getElementsByClassName(
"zci-wrapper",
"div"
);
if(count($wiki_wrapper) !== 0){
$this->fuckhtml->load($wiki_wrapper[0]);
$a =
$this->fuckhtml
->getElementsByTagName(
"a"
);
if(count($a) !== 0){
$link =
$this->unshiturl(
$this->fuckhtml
->getTextContent(
$a[0]["attributes"]["href"]
)
);
}else{
$link = null;
}
$title =
$this->fuckhtml
->getElementsByTagName(
"h1"
);
if(count($title) !== 0){
$title =
$this->fuckhtml
->getTextContent(
$title[0]
);
}else{
$title = null;
}
$description =
$this->fuckhtml
->getElementById(
"zero_click_abstract",
"div"
);
if($description !== false){
$this->fuckhtml->load($description);
$thumb =
$this->fuckhtml
->getElementsByTagName(
"img"
);
if(count($thumb) !== 0){
$thumb =
$this->fuckhtml
->getTextContent(
$thumb[0]["attributes"]["src"]
);
}else{
$thumb = null;
}
$as =
$this->fuckhtml
->getElementsByTagName(
"a"
);
foreach($as as $a){
$description["innerHTML"] =
str_replace(
$a["outerHTML"],
"",
$description["innerHTML"]
);
}
$description =
$this->fuckhtml
->getTextContent(
$description
);
$out["answer"][] = [
"title" => $title,
"description" => [
[
"type" => "text",
"value" => $description
]
],
"url" => $link,
"thumb" => $thumb,
"table" => [],
"sublink" => []
];
}
// reset
$this->fuckhtml->load($html);
}
//
// Get results
//
$results =
$this->fuckhtml
->getElementsByClassName(
"result",
"div"
);
foreach($results as $result){
$this->fuckhtml->load($result);
$title =
$this->fuckhtml
->getElementsByTagName(
"h2"
);
if(count($title) === 0){
// should not happen
continue;
}
$title =
$this->fuckhtml
->getTextContent(
$title[0]
);
$description_obj =
$this->fuckhtml
->getElementsByClassName(
"result__snippet",
"a"
);
if(count($description_obj) === 0){
$description = null;
}else{
$description =
$this->titledots(
$this->fuckhtml
->getTextContent(
$description_obj[0]
)
);
}
$url =
$this->fuckhtml
->getTextContent(
$description_obj[0]["attributes"]["href"]
);
$out["web"][] = [
"title" => $this->titledots($title),
"description" => $description,
"url" => $this->unshiturl($url),
"date" => null,
"type" => "web",
"thumb" => [
"ratio" => null,
"url" => null
],
"sublink" => [],
"table" => []
];
}
return $out;
}
public function web_full($get, $npt = null){
$out = [
"status" => "ok",
"spelling" => [
"type" => "no_correction",
"using" => null,
"correction" => null
],
"npt" => null,
"answer" => [],
"web" => [],
"image" => [],
"video" => [],
"news" => [],
"related" => []
];
if($npt !== null){
[$js_link, $proxy] = $npt;
$js_link = "https://links.duckduckgo.com" . $js_link;
$html = "";
@ -490,6 +879,7 @@ class ddg{
throw new Exception("Failed to fetch d.js");
}
//$js = file_get_contents("scraper/fuck.js");
//echo htmlspecialchars($js);
$js_tmp =
@ -501,6 +891,139 @@ class ddg{
if(count($js_tmp) <= 1){
//
// Detect javascript challenge
//
if(
preg_match(
'/DDG\.deep\.initialize\(\'([^\']+)\'\ *\+ *jsa/i',
$js,
$challenge_url
)
){
throw new Exception("DuckDuckGo returned a JSA challenge");
// get JSA initial token
if(
!preg_match(
'/let jsa *= *([0-9]+)/',
$js,
$jsa
)
){
$jsa = 0;
}else{
$jsa = (int)$jsa[1];
}
// get function bodies
preg_match_all(
'/let *([A-Za-z0-9]+) *= *function\(.*\) *{(.*)};/sU',
$js,
$functions
);
$parsed_functions = [];
for($i=0; $i<count($functions[0]); $i++){
$functions[2][$i] = trim($functions[2][$i]);
if(
preg_match(
'/return num *\* *([0-9]+)/i',
$functions[2][$i],
$num
)
){
$parsed_functions[$functions[1][$i]] = [
"type" => "multiplication",
"num" => (int)$num[1]
];
continue;
}
if(
preg_match(
'/innerHTML *= *`([^`]+)`/i',
$functions[2][$i],
$challenge
)
){
$challenge[1] =
preg_replace(
'/<\/(br)>/',
'<$1>',
$challenge[1]
);
$parsed_functions[$functions[1][$i]] = [
"type" => "challenge",
"text" => $challenge[1]
];
}
}
// get function call order
preg_match_all(
'/jsa *= *([A-Za-z0-9]+)\(jsa\)/i',
$js,
$call_order
);
foreach($call_order[1] as $order){
if(!isset($parsed_functions[$order])){
throw new Exception("JS challenge solve failure: DuckDuckGo called an unknown function");
}
if($parsed_functions[$order]["type"] == "multiplication"){
$jsa = $jsa * $parsed_functions[$order]["num"];
continue;
}
if($parsed_functions[$order]["type"] == "challenge"){
// @TODO get parsed length
//$parsed_functions[$order]["text"]
$jsa = $jsa + strlen($parsed_functions[$order]["text"]);
}
}
try{
$js = $this->get(
$proxy,
"https://links.duckduckgo.com" . $challenge_url[1] . $jsa,
[],
ddg::req_xhr
);
}catch(Exception $error){
throw new Exception("Failed to get challenged d.js");
}
}
//
// Detect JavaScript anomaly failure thingy
//
if(
preg_match(
'/DDG.deep.anomalyDetectionBlock\({/',
$js
)
){
throw new Exception("DuckDuckGo detected an anomaly in the Javascript challenge response");
}
throw new Exception("Failed to grep pageLayout(d)");
}
@ -678,7 +1201,7 @@ class ddg{
// get NPT
$out["npt"] =
$this->backend->store(
$item["n"],
"1," . $item["n"],
"web",
$proxy
);
@ -2065,7 +2588,7 @@ class ddg{
$start = $tag["endPos"];
}
// stuff out remainder
// shit out remainder
$description[] = [
"type" => "text",
"value" =>
@ -2129,10 +2652,24 @@ class ddg{
private function unshiturl($url){
// check for domains w/out first short subdomain (ex: www.)
// remove tracking redirect
// yes, the privacy search engine has click-out tracking. great!
$domain = parse_url($url, PHP_URL_HOST);
if($domain == "duckduckgo.com"){
$query = parse_url($url, PHP_URL_QUERY);
parse_str($query, $query);
if(isset($query["uddg"])){
$url = $query["uddg"];
$domain = parse_url($url, PHP_URL_HOST);
}
}
// check for domains w/out first short subdomain (ex: www.)
$subdomain = preg_replace(
'/^[A-z0-9]{1,3}\./',
"",
@ -2246,4 +2783,4 @@ class ddg{
floor($height * $ratio)
];
}
}
}