File: /www/wwwroot/healthyton.com/wp-scrap/spinbot_get_gossipgist.php
<?php
require 'vendor/autoload.php';
use Goutte\Client;
/**
 * Remove the given element(s) from an HTML fragment, including their content.
 *
 * Matching is case-insensitive and spans line breaks. The closing tag must
 * carry the same name as the opening tag that was matched.
 *
 * @param string       $html HTML fragment to clean.
 * @param string|array $tags A tag name, a '|'-separated alternation of tag
 *                           names (inserted into the pattern as-is), or an
 *                           array of literal tag names.
 * @return string|null Cleaned HTML, or null if preg_replace() reports an error.
 */
function strip_tags_blacklist($html, $tags) {
    if (is_array($tags)) {
        if (count($tags) === 0) {
            // Nothing to strip.
            return $html;
        }
        $quoted = array();
        foreach ($tags as $tagName) {
            $quoted[] = preg_quote($tagName, '/');
        }
        $tags = implode('|', $quoted);
    }

    // The tag list is wrapped in a group so that an alternation such as "b|i"
    // applies to the tag name only instead of splitting the whole pattern;
    // \1 then ties the closing tag to the opening tag that matched.
    return preg_replace('/<(' . $tags . ')\b[^>]*>.*?<\/\1>/is', '', $html);
}
// ---------------------------------------------------------------------------
// Request handler: fetch a biography page, reduce its markup to plain
// paragraphs, send the text through spinbot() and emit the result as JSON.
//
// Input : optional $_GET['url'] (defaults to a sample gossipgist.com profile).
// Output: JSON object with the keys name, image, meta, body, available-spins,
//         or {"error": "..."} with a 4xx/5xx status when the request cannot
//         be served.
// ---------------------------------------------------------------------------
$url = 'https://gossipgist.com/kenneth-petty';
if (isset($_GET['url'])) {
    $url = $_GET['url'];
}

// The URL is caller-supplied: only absolute http(s) URLs are handed to the HTTP client.
$urlParts = is_string($url) ? parse_url($url) : false;
$urlScheme = (is_array($urlParts) && isset($urlParts['scheme'])) ? strtolower($urlParts['scheme']) : '';
if (!in_array($urlScheme, array('http', 'https'), true) || empty($urlParts['host'])) {
    http_response_code(400);
    header('Content-Type: application/json');
    echo json_encode(array('error' => 'Invalid url parameter'));
    exit;
}

$prefix = 'hton_';
$meta = array();
$client = new Client();
$crawler = $client->request('GET', $url);

// Profile facts are read from <td itemprop="..."> cells (see get_meta()).
$meta[$prefix . 'background_full_name'] = get_meta($crawler, 'name');
$meta[$prefix . 'background_dob'] = get_meta($crawler, 'birthDate');
$meta[$prefix . 'body_gender'] = get_meta($crawler, 'gender');
$meta[$prefix . 'background_profession'] = get_meta($crawler, 'jobTitle');
$meta[$prefix . 'background_nicknames'] = get_meta($crawler, 'givenName');

// array_shift() takes its argument by reference, so the list needs a variable.
$titles = $crawler->filter('h1.entry-title')->extract(array('_text'));
$name = array_shift($titles);

// attr() throws on an empty node list, so check for the node first; otherwise
// the og:image fallback below would never be reached.
$image = '';
$profilePicture = $crawler->filter('img.profile-picture');
if ($profilePicture->count() > 0) {
    $image = $profilePicture->attr('src');
}
if (empty($image)) {
    try {
        $image = $crawler->filterXPath('//meta[@property="og:image"]')->attr('content');
    } catch (Exception $e) {
        $image = '';
    }
}

// html() also throws on an empty node list: report a missing article as JSON.
$biography = $crawler->filter('article#biography');
if ($biography->count() === 0) {
    http_response_code(502);
    header('Content-Type: application/json');
    echo json_encode(array('error' => 'No article#biography element found at the requested URL'));
    exit;
}
$html = $biography->html();

// Drop these elements together with everything inside them.
$remove_tags_and_content = array('table', 'figcaption', 'div', 'iframe', 'font');
foreach ($remove_tags_and_content as $tg) {
    $html = strip_tags_and_content($html, $tg);
}

// Empty out <script>/<style> bodies; the bare tags are removed further down.
$html = preg_replace('/(<(script|style)\b[^>]*>).*?(<\/\2>)/is', '$1$3', $html);

// Remove links including their text, also when they span several lines.
// \b keeps <abbr>, <address>, <article>, ... from being mistaken for <a>.
$html = preg_replace('#<a\b[^>]*>.*?</a>#is', '', $html);

// Unwrap the remaining presentational tags but keep their content.
$strip_tags = "center|style|span|ins|div|script|a|img|input|button|figure";
$html = preg_replace('#<\s*\/?(' . $strip_tags . ')\b[^>]*?>#im', '', $html);

// Remove empty paragraphs and empty <strong> pairs. Whitespace, the &nbsp;
// entity and a raw UTF-8 non-breaking space (bytes C2 A0) all count as blank.
$html = preg_replace('#<p[^>]*>(\s|&nbsp;|\xC2\xA0|</?\s?br\s?/?>)*</?p>#', '', $html);
$html = preg_replace('#<strong>(\s|&nbsp;|\xC2\xA0)*</strong>#', '', $html);
$html = str_replace(array('<p></p>', '<p><br></p>'), '', $html);
$html = trim($html);

// Closing tags orphaned by the non-greedy removal of nested <div>s.
$textToSpin = str_replace('</div>', '', $html);

$response = spinbot($textToSpin);
if (!is_string($response) || $response === '') {
    http_response_code(502);
    header('Content-Type: application/json');
    echo json_encode(array('error' => 'Spinbot request failed'));
    exit;
}

// spinbot() enables CURLOPT_HEADER, so the response starts with one header
// block per HTTP response received. An interim "100 Continue" block is only
// present for some requests, so peel blocks off until the body is reached
// and keep the last (final) header block.
$strResponseHeaders = '';
$strResponseBody = $response;
while (strncmp($strResponseBody, 'HTTP/', 5) === 0 && strpos($strResponseBody, "\r\n\r\n") !== false) {
    list($strResponseHeaders, $strResponseBody) = explode("\r\n\r\n", $strResponseBody, 2);
}

// Header names are case-insensitive; normalise before looking one up.
$aHeaders = array_change_key_case(putHeadersTextIntoArray($strResponseHeaders), CASE_LOWER);

$fields = array(
    'name' => $name,
    'image' => $image,
    'meta' => $meta,
    'body' => $strResponseBody,
    'available-spins' => isset($aHeaders['available-spins']) ? $aHeaders['available-spins'] : null,
);
header('Content-Type: application/json');
echo json_encode($fields);
/**
 * Remove every <$tag ...>...</$tag> element from $html, content included.
 *
 * The match is non-greedy, so with nested elements of the same name the
 * removal stops at the first closing tag and the outer closing tag is left
 * behind.
 *
 * @param string $html HTML fragment to clean.
 * @param string $tag  Tag name (inserted into the pattern as-is).
 * @return string|null Cleaned HTML, or null if preg_replace() reports an error.
 */
function strip_tags_and_content($html,$tag){
    // \b stops a short name from matching longer tags ("b" vs <br>, "p" vs <pre>);
    // the i flag is there because HTML tag names are case-insensitive.
    $pattern = '/<(?:' . $tag . ')\b[^>]*>[\s\S]*?<\/(?:' . $tag . ')\b[^>]*>/i';
    return preg_replace($pattern, '', $html);
}
/**
 * Connectivity probe: issue a GET request to https://healthyton.com with a
 * 10-second connect timeout.
 *
 * @return string|bool Whatever curl_exec() returns for the transfer.
 */
function curlTest(){
    $handle = curl_init();
    curl_setopt_array($handle, array(
        CURLOPT_URL => 'https://healthyton.com',
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 10,
    ));
    $body = curl_exec($handle);
    curl_close($handle);
    return $body;
}
/**
 * Send text to the Spinbot API and return the raw HTTP response.
 *
 * The text is sent as the POST body; options travel as x-* request headers.
 * CURLOPT_HEADER is enabled, so the returned string consists of the response
 * header block(s) followed by the response body.
 *
 * @param string $textToSpin Text to be rewritten.
 * @return string|false Raw response including headers, or false when the
 *                      handle cannot be created or the transfer fails.
 */
function spinbot($textToSpin){
    $url = 'https://api.spinbot.com';

    // Prefer a key supplied through the environment; the literal stays as a
    // fallback so existing deployments keep working.
    // NOTE(review): a key committed to source should be treated as leaked -
    // rotate it and drop the fallback once SPINBOT_API_KEY is configured.
    $spinbotApiKey = getenv('SPINBOT_API_KEY');
    if ($spinbotApiKey === false || $spinbotApiKey === '') {
        $spinbotApiKey = '87b0d1bb9b3c420381ee141837cbc7f5';
    }

    $header = array(
        "x-auth-key:$spinbotApiKey",
        'x-spin-cap-words:true',
        'x-words-to-skip:rewrit,nonExistentWordPart',
        'x-min-percent-change-per-sentence:any',
    );

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $textToSpin);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_HEADER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    // The request carries the API key, so the server certificate and host
    // name must be verified; verbose tracing is left off for the same reason.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);

    $response = curl_exec($ch);
    curl_close($ch);
    return $response;
}
/**
 * Normalise a free-form date string to "Y-m-d".
 *
 * Commas, dots, <sup> tags and English ordinal suffixes (1st, 2nd, 3rd, 4th)
 * are removed before the string is handed to DateTime.
 *
 * @param string $date Date text, e.g. "August 1st, 2020" or "4<sup>th</sup> July 1990".
 * @return string|null Date formatted as Y-m-d. When the text cannot be parsed
 *                     the exception is passed to debug(), which prints it and
 *                     exits by default; null is returned if it does not exit.
 */
function cleanDate($date){
    $date = trim($date);
    // Unwrap <sup>..</sup> first so "4<sup>th</sup>" becomes "4th".
    $date = preg_replace('#</?sup\b[^>]*>#i', '', $date);
    $date = str_replace(array(',', '.'), '', $date);
    // Strip ordinal suffixes only when they follow a digit; a blind
    // str_replace('st', ...) would turn "August" into "Augu".
    $date = preg_replace('/(\d)\s*(?:st|nd|rd|th)\b/i', '$1', $date);
    try{
        $dt = new DateTime($date);
    }catch (Exception $e){
        debug($e);
        // Only reached if debug() returns; $dt is undefined here.
        return null;
    }
    return $dt->format('Y-m-d');
}
/**
 * Parse a raw HTTP header block into an associative array.
 *
 * The first line is stored under the key 'http_code'; every following
 * "Name: value" line becomes $headers['Name'] = 'value'. Header names keep
 * the case in which they were received.
 *
 * @param string $header_text Header block with lines separated by "\r\n".
 * @return array Header map as described above.
 */
function putHeadersTextIntoArray($header_text) {
    $headers = array();
    foreach (explode("\r\n", $header_text) as $i => $line) {
        if ($i === 0) {
            $headers['http_code'] = $line;
            continue;
        }
        // Split on the first colon only, so values that contain ": " stay intact.
        $parts = explode(':', $line, 2);
        if (count($parts) < 2) {
            // Blank or malformed line: nothing to record.
            continue;
        }
        $headers[trim($parts[0])] = trim($parts[1]);
    }
    return $headers;
}
/**
 * Dump a value with print_r() wrapped in <pre> tags.
 *
 * @param mixed $arr  Value to print.
 * @param bool  $exit When truthy (the default) the script terminates after printing.
 * @return void
 */
function debug($arr, $exit = true){
    $dump = print_r($arr, true);
    echo '<pre>' . $dump . '</pre>';
    if (!$exit) {
        return;
    }
    exit;
}
/**
 * Return the text of the first <td itemprop="$meta_name"> cell on the page.
 *
 * @param object $crawler   Crawler for the fetched page; must provide
 *                          filterXPath() whose result provides extract().
 * @param string $meta_name itemprop value to look for (placed into the XPath
 *                          expression verbatim).
 * @return string|null Text of the first matching cell, or null when no cell matches.
 */
function get_meta($crawler, $meta_name){
    // extract() is given an array, which every DomCrawler version accepts.
    // The result is kept in a variable rather than fed to array_shift(),
    // which takes its argument by reference.
    $values = $crawler->filterXPath('//td[@itemprop="'.$meta_name.'"]')->extract(array('_text'));
    return isset($values[0]) ? $values[0] : null;
}
/**
 * Split a full name into first, last and middle parts.
 *
 * The first word is the first name, the last word the last name, and
 * everything in between forms the middle name. Leading, trailing and repeated
 * whitespace is ignored.
 *
 * @param string $name Full name.
 * @return array Keys 'firstname', 'lastname', 'middlename' (in that order).
 *               'lastname' is null when the name has fewer than two words;
 *               'firstname' is '' when the name is blank.
 */
function splitName($name) {
    // Splitting on whitespace runs avoids empty parts for "  John   Smith ".
    $parts = preg_split('/\s+/', trim($name), -1, PREG_SPLIT_NO_EMPTY);
    $first = count($parts) > 0 ? array_shift($parts) : '';
    $last = array_pop($parts);
    return array(
        'firstname' => $first,
        'lastname' => $last,
        'middlename' => implode(' ', $parts)
    );
}