Log in

View Full Version : Resolved preg_match not matching



Master_script_maker
06-12-2008, 06:37 PM
I'm developing a proxy server for a client, who will route all of his workers' internet traffic through it, so I have to parse and rewrite everything. The only thing I have a problem with is forms. This is my code:
// Intended to insert a hidden "ac_url" input, holding the form's original action URL,
// directly after each opening <form ...> tag.
// NOTE(review): every (.*) here is greedy and the pattern has no "s" modifier, so "." stops
// at newlines and each group runs as far along the current line as it can - the action="..."
// capture can therefore extend past the closing quote of the action attribute.
$pageData = preg_replace('/(<form(.*)action="(.*)"(.*)>)(.*)/', '$1<input type="hidden" name="ac_url" value="$3" />$5', $pageData);
for some reason this is not working. any help would be much appreciated.
thanks

Master_script_maker
06-13-2008, 01:54 AM
Does anyone have any suggestions?

Jas
06-13-2008, 04:24 AM
For one thing, it is set to greedy. You can place a U (uppercase, the "ungreedy" modifier) after the closing delimiter of the pattern, or follow every .* with a ?.

.* = go as far as you can
.*?= stop as soon as you can

Another thing, you might need to be more specific. For example, something like [^>]*? tells it to find everything that is not a > (and the ? tells it to stop at the first >).

jackbenimble4
06-13-2008, 04:31 AM
Whenever I write a regular expression that isn't matching, I break it down and rebuild it slowly piece by piece.

Writing proxies can be a total b!tch. I made a stab at one for fun once, and it worked well enough that it let me pass through a Webwasher filter and browse blocked sites. I remember it had a few problems I gave up on because I really had no reason to further develop it. If you'd like to take a look, here's the crude class I used:


<?php

// A proxy class for creating a proxy to bypass a filter


/**
 * A small rewriting HTTP proxy.
 *
 * fetchHtml() downloads the page set with setUrl() through cURL, and
 * getFormattedPage() returns it with href / src / action / <param value> /
 * CSS locations rewritten to "<proxyBaseUrl>?url=<target>", so that follow-up
 * requests are sent to the proxy script again.
 *
 * Relative locations are resolved against the site root returned by
 * getDomain(), not against the directory of the current page.
 */
class jproxy {
    /** @var string|null URL of the proxy script; every rewritten location points at it. */
    protected $proxyBaseUrl = NULL;
    /** @var string|null URL of the page being proxied. */
    protected $currentUrl = NULL;
    /** @var string|null Optional override for getDomain(); nothing in this class sets it. */
    protected $pageDomain = NULL;
    /** @var string|null|false Content-Type of the last response as reported by cURL. */
    protected $fullContentType = NULL;
    /** @var string|null Media type part of $fullContentType (e.g. "text/html"). */
    protected $contentType = NULL;
    /** @var string|null Body of the last response. */
    protected $html = NULL;

    /**
     * @param string $baseUrl URL of the proxy script itself.
     */
    public function __construct($baseUrl) {
        $this->setBaseUrl($baseUrl);
    }

    /**
     * @param string $url URL of the proxy script itself.
     */
    public function setBaseUrl($url) {
        $this->proxyBaseUrl = $url;
    }

    /**
     * @param string $url URL of the page to fetch.
     */
    public function setUrl($url) {
        $this->currentUrl = $url;
    }

    /**
     * @return string|null URL of the page to fetch.
     */
    public function getUrl() {
        return $this->currentUrl;
    }

    /**
     * Download the current URL and remember its body and content type.
     *
     * Data in $_POST is forwarded as a POST request. The cookie jar path is
     * read from $GLOBALS['cookieFile'] and the user agent from
     * $_SERVER['HTTP_USER_AGENT'] when they are set.
     *
     * @return bool false when no URL is set or the transfer failed, true otherwise.
     */
    public function fetchHtml() {
        if (!$this->getUrl()) {
            // no url to grab html for
            return false;
        }

        $cookieFile = isset($GLOBALS['cookieFile']) ? (string) $GLOBALS['cookieFile'] : '';
        $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? (string) $_SERVER['HTTP_USER_AGENT'] : '';

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 4);
        curl_setopt($ch, CURLOPT_URL, $this->getUrl());
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_COOKIESESSION, true);
        // An empty COOKIEFILE still switches the cookie engine on for this handle.
        curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
        if ($cookieFile !== '') {
            curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
        }
        if ($userAgent !== '') {
            curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
        }
        // The URL comes from the visitor: never let cURL read file://, ftp://, etc.,
        // neither directly nor through a redirect.
        if (defined('CURLOPT_PROTOCOLS')) {
            curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
            curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        }

        if (!empty($_POST)) {
            // http_build_query() URL-encodes keys and values and copes with nested arrays.
            curl_setopt($ch, CURLOPT_POST, 1);
            curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($_POST, '', '&'));
        }

        $body = curl_exec($ch);
        $this->fullContentType = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);

        // From PHP 8.0 on curl_close() does nothing; handles are freed automatically.
        if (version_compare(PHP_VERSION, '8.0.0', '<')) {
            curl_close($ch);
        }

        // Reset first so a failed or untyped response does not inherit a stale type.
        $this->contentType = NULL;
        if (preg_match('@([\w/+.\-]+)@', (string) $this->fullContentType, $matches)) {
            $this->contentType = $matches[1];
        }

        if ($body === false) {
            $this->html = '';
            return false;
        }

        $this->html = $body;
        return true;
    }

    /**
     * Site root of the current URL, with a trailing slash.
     *
     * A leading "www." is not included in the result.
     *
     * @return string e.g. "http://example.org/"; "" when the current URL is
     *                not an http(s) URL.
     */
    public function getDomain() {
        if ($this->pageDomain) {
            return $this->pageDomain;
        }

        if (preg_match('/^(https?):\/\/(www\.)?([^\/?#]+)/i', (string) $this->currentUrl, $matches)) {
            return strtolower($matches[1]).'://'.$matches[3].'/';
        }

        return '';
    }

    /**
     * Turn a location found in the page into an absolute URL.
     *
     * http:// and https:// URLs are returned as they are, protocol-relative
     * URLs ("//host/path") get the scheme of the current page, and everything
     * else is appended to getDomain() with exactly one slash in between.
     *
     * @param string $url
     * @return string
     */
    private function absolutize($url) {
        if (preg_match('/^https?:\/\//i', $url)) {
            return $url;
        }

        if (substr($url, 0, 2) === '//') {
            $scheme = (stripos((string) $this->currentUrl, 'https://') === 0) ? 'https:' : 'http:';
            return $scheme.$url;
        }

        return $this->getDomain().ltrim($url, '/');
    }

    /**
     * Address of the proxy script that will fetch $url.
     *
     * @param string $url Absolute or relative location taken from the page.
     * @return string
     */
    private function proxify($url) {
        return $this->proxyBaseUrl.'?url='.$this->absolutize($url);
    }

    /**
     * preg_replace_callback() that hands back the untouched subject when PCRE
     * fails (it returns null then), so a regex error cannot blank the page.
     *
     * @param string   $pattern
     * @param callable $callback
     * @param string   $subject
     * @return string
     */
    private function rewrite($pattern, $callback, $subject) {
        $subject = (string) $subject;
        $result = preg_replace_callback($pattern, $callback, $subject);

        return ($result === null) ? $subject : $result;
    }

    /**
     * @param string $url Value of an href attribute.
     * @return string Complete replacement attribute, always double-quoted.
     */
    protected function modifyHref($url) {
        return 'href="'.$this->proxify($url).'"';
    }

    /**
     * @param string $text Value of a src attribute.
     * @return string Complete replacement attribute, always double-quoted.
     */
    protected function modifySrc($text) {
        return 'src="'.$this->proxify($text).'"';
    }

    /**
     * @param string $text Value of a form action attribute.
     * @return string Complete replacement attribute, always double-quoted.
     */
    protected function modifyActions($text) {
        return 'action="'.$this->proxify($text).'"';
    }

    /**
     * @param string $text Value of a <param value="..."> attribute.
     * @return string Replacement for the matched '<param value="..."' prefix.
     */
    protected function modifyFlash($text) {
        return '<param value="'.$this->proxify($text).'"';
    }

    /**
     * @return string Extension of the current URL including the leading dot
     *                (".html"), or "" when the URL contains no dot.
     */
    protected function getExtension() {
        if (preg_match('/\.([^.]+)$/', (string) $this->currentUrl, $matches)) {
            return $matches[0];
        }

        return '';
    }

    /**
     * @return string The current URL with every "/", ":" and "." removed.
     */
    protected function stripShit() {
        return str_replace(array('/', ':', '.'), '', (string) $this->currentUrl);
    }

    /**
     * Insert the proxy's hidden form field right after the opening <form> tag.
     *
     * The field is emitted with an empty value. Nothing in this class calls
     * this method.
     *
     * @param string $whole    Complete form markup, from "<form" to "</form>".
     * @param string $formAttr Attribute text of the opening tag (not needed to place the field).
     * @param string $content  Markup between the tags (not needed to place the field).
     * @return string $whole with the field inserted; unchanged when it contains no ">".
     */
    protected function modifyForm($whole, $formAttr, $content) {
        $inserted = "\n\n<!--INSERTED FORM ELEMENT BY PROXY -->\n"
            .'<input type="hidden" name="url_204s52bg" value="" />';

        // The first ">" closes the opening tag; the form content starts right behind it.
        $tagEnd = strpos($whole, '>');
        if ($tagEnd === false) {
            return $whole;
        }

        return substr_replace($whole, $inserted, $tagEnd + 1, 0);
    }

    /**
     * @param string $props   Attribute text of the <style> tag.
     * @param string $content Stylesheet text between the tags.
     * @return string The <style> element with its CSS locations rewritten.
     */
    protected function modifyInlineStyles($props,$content) {
        $content = $this->modifyCssLocations($content);
        return "<style".$props.">".$content."</style>";
    }

    /**
     * Rewrite every location in $this->html so it points at the proxy.
     *
     * Callbacks are used instead of the "e" pattern modifier: the matched text
     * reaches the modify*() methods as plain data, is never evaluated as PHP
     * code, and is not escaped on the way, so the page needs no stripslashes()
     * afterwards. modifyForm() is not applied here.
     */
    protected function modifyLocations() {
        $html = (string) $this->html;

        // replace href (links, stylesheets, etc.)
        $html = $this->rewrite('/href=(\'|")(.+?)\1/i', function ($m) {
            return $this->modifyHref($m[2]);
        }, $html);

        // replace src (images, etc.)
        $html = $this->rewrite('/src=(\'|")(.+?)\1/i', function ($m) {
            return $this->modifySrc($m[2]);
        }, $html);

        // replace form actions
        $html = $this->rewrite('/action=("|\')(.+?)\1/i', function ($m) {
            return $this->modifyActions($m[2]);
        }, $html);

        // replace flash paths
        $html = $this->rewrite('/<param value=("|\')(.+?)\1/i', function ($m) {
            return $this->modifyFlash($m[2]);
        }, $html);

        // replace locations in inline stylesheets; "s" lets "." cross line
        // breaks, because <style> blocks normally span several lines
        $html = $this->rewrite('/<style([^>]*)>(.+?)<\/style>/is', function ($m) {
            return $this->modifyInlineStyles($m[1], $m[2]);
        }, $html);

        $this->html = $html;
    }

    /**
     * @param string $url Text found between "url(" and ")".
     * @return string Replacement "url(...)" expression; data: URIs are returned unchanged.
     */
    private function modifyCssUrl($url) {
        // url("x") and url('x') are as valid as url(x): drop quotes and padding
        $target = trim($url, " \t\r\n\"'");

        if (stripos($target, 'data:') === 0) {
            // embedded resource, there is nothing to fetch through the proxy
            return 'url('.$url.')';
        }

        return 'url('.$this->proxify($target).')';
    }

    /**
     * @param string $file Location named by an @import "..." rule.
     * @return string Replacement @import rule, always double-quoted.
     */
    private function modifyCssImport($file) {
        return '@import "'.$this->proxify($file).'";';
    }

    /**
     * Rewrite url(...) expressions and quoted @import rules of a stylesheet.
     *
     * @param string $content Stylesheet text.
     * @return string
     */
    private function modifyCssLocations($content) {
        $content = $this->rewrite('/url\((.+?)\)/i', function ($m) {
            return $this->modifyCssUrl($m[1]);
        }, $content);

        // replace imports
        $content = $this->rewrite('/@import(|\s)(\'|")(.+?)\2;/i', function ($m) {
            return $this->modifyCssImport($m[3]);
        }, $content);

        return $content;
    }

    /**
     * @return string|null The response body as currently stored.
     */
    public function getHtml() {
        return $this->html;
    }

    /**
     * Return the fetched document, rewritten according to its content type.
     *
     * HTML and XHTML (and responses without a known content type) get all
     * their locations rewritten. Anything else is passed on with its original
     * Content-Type header, and stylesheets additionally get their url() and
     * @import locations rewritten.
     *
     * @return string|null
     */
    public function getFormattedPage() {
        $type = (string) $this->contentType;
        $isHtml = stripos($type, 'text/html') !== false
            || stripos($type, 'application/xhtml') !== false;

        if ($type !== '' && !$isHtml) {
            if (!headers_sent()) {
                header("Content-Type: ".$this->fullContentType);
            }
            if (stripos($type, 'css') !== false) {
                // css file: only its url()/@import locations need rewriting
                $this->html = $this->modifyCssLocations($this->html);
            }
        }
        else {
            // we only want to modify locations if it's a regular old webpage
            $this->modifyLocations();
        }

        return $this->html;
    }

}

?>

And I used the class like so:



<?php

// Proxy front controller: keeps a 20-minute "proxSession" cookie, derives the
// cURL cookie file name from it and prints the proxied page for ?url=...

// The session id ends up in a file name, so a cookie value is only trusted
// when it has the shape of an id generated below (32 lowercase hex digits).
$sessionId = isset($_COOKIE['proxSession']) ? $_COOKIE['proxSession'] : '';
if (!is_string($sessionId) || !preg_match('/^[a-f0-9]{32}$/', $sessionId)) {
    if (function_exists('random_bytes')) {
        $sessionId = bin2hex(random_bytes(16));
    }
    else {
        $userAgent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
        $sessionId = md5('prox'.microtime().$userAgent);
    }
}

// set the cookie, or renew the time for an existing one
setcookie('proxSession', $sessionId, time()+60*20, '/', '.example.com');

// setcookie() does not fill $_COOKIE for the current request, so the id is
// taken from the local variable; first-time visitors get their own file too.
$cookieFile = "cook_".$sessionId;

// get the class
require_once('jproxy.class.php');


// only plain http(s) URLs are proxied; arrays (?url[]=...) and other schemes are ignored
if(isset($_GET['url']) && is_string($_GET['url']) && preg_match('/^https?:\/\//i', $_GET['url'])) {
$p = new jproxy("http://www.example.com/page.php");
$p->setUrl($_GET['url']);
$p->fetchHtml();

print $p->getFormattedPage();
}

?>


Looking back at the class, it looks like I used this regular expression to match forms:


// replace form actions
// (the "e" pattern modifier no longer exists in PHP 7+, so the rewrite is done in a callback,
// which also keeps the matched text from being evaluated as PHP code)
$this->html = preg_replace_callback('/action=("|\')(.+?)\1/i', function ($m) { return $this->modifyActions($m[2]); }, $this->html);

Which used the function:



/**
 * Point a form action at the proxy.
 *
 * @param string $text Value of the form's action attribute.
 * @return string Complete replacement attribute, always double-quoted.
 */
protected function modifyActions($text) {
// http:// and https:// targets are already absolute
if(preg_match('/^https?:\/\//i', $text))
{
$target = $text;
}
else
{
// getDomain() is used as a prefix here; strip leading slashes from the path so the
// joined URL has a single "/" when getDomain() returns a root ending in "/"
$target = $this->getDomain().ltrim($text, '/');
}

return "action=\"".$this->proxyBaseUrl."?url=".$target."\"";
}


I hope some of that was some help. Sorry I couldn't help with your expression, but I don't have time to break it down and test it.

Master_script_maker
06-13-2008, 03:47 PM
Thanks, guys. I didn't find the problem, but I rewrote it and it worked fine.