Log in

View Full Version : BBCode Parser



jackbenimble4
06-05-2008, 08:17 PM
I've been working on building a custom forum software from the ground up the past few months. When I got to parsing messages, I wrote a TextParser class to handle bbcode, smilies and the like. It worked well until I realized that embedded bbcodes of the same type weren't being parsed.

I found some discussions on the topic and they pointed to using preg_replace_callback to recursively replace the bbcodes. I had some trouble implementing it and got some strange results. In search for more answers I came across a blog post where someone said to avoid regular expressions to parse bbcode structures. How would I not use regular expressions to parse bbcode? Are there any other alternatives?

For the curious, here's my class. My attempt at recursion is commented out and the original attempt isn't.


<?php

class Penelope_TextParser {

private $BBCodes = array();
private $smilies = array();
private $currentReplacement;
private $currentBBCode;

public function __construct() {

$this->loadCodes();
$this->loadSmilies();

}

private function loadCodes() {

// this function loads all the bbcodes in the database into the object for use later when parsing is needed.

// get the database connection
$db = Penelope_DB::getInstance();

$getCodes = $db->prepare("SELECT * FROM bbcodes");
$getCodes->execute();
$this->BBCodes = $getCodes->fetchAll(PDO::FETCH_ASSOC);

if($getCodes->errorCode() == "0000") {
return true;
}
else {
return false;
}

}

private function loadSmilies() {

// this function loads all the smilies in the database into the object for use later when parsing

$db = Penelope_DB::getInstance();

$getSmilies = $db->prepare("SELECT * FROM smilies");
$getSmilies->execute();
$this->smilies = $getSmilies->fetchAll(PDO::FETCH_ASSOC);

if($getSmilies->errorCode() == "0000")
{
return true;
}
else {
return false;
}

}

private function prepareReplacement($replacement, $use_option) {

if($use_option) {
$replacement = preg_replace('/{option}/', '\$1', $replacement);

$replacement = preg_replace('/{param}/', '\$2', $replacement);
}
else {
$replacement = preg_replace('/{param}/', '\$1', $replacement);
}

return $replacement;

}

public function parse($text) {

$text = htmlentities($text);
$text = $this->parseBBCode($text);
$text = $this->parseSmilies($text);

$text = nl2br($text);

return $text;

}

protected function recursiveReplaceSimple($input) {

//$arr = $this->currentBBCode;

if(is_array($input)) {

$replacement = $this->currentReplacement;

$input = str_replace('{param}', $input[1], $replacement);
}

return preg_replace_callback('/\['.$arr['tag'].'\](.+)\[\/'.$arr['tag'].'\]/is', array(&$this, 'recursiveReplaceSimple'), $input, 5);

}

protected function recursiveReplaceAdvanced($input) {

$arr = $this->currentBBCode;

if(is_array($input)) {

$replacement = $this->currentReplacement;

$option = $input[1];
$param = $input[2];

$input = str_replace('{param}', $param, $replacement);
$input = str_replace('{option}', $option, $input);
}

return preg_replace_callback('/\['.$arr['tag'].'\](.+)\[\/'.$arr['tag'].'\]/is', array(&$this, 'recursiveReplaceAdvanced'), $input, 5);

}

public function parseBBCode($text) {

foreach($this->BBCodes as $key => $arr)
{

if(!$arr['use_option']) {

/*
$this->currentReplacement = $arr['replacement'];
$this->currentBBcode = $arr;

// RECURSIVE REPLACEMENT

$text = preg_replace_callback('/\['.$arr['tag'].'\](.+)\[\/'.$arr['tag'].'\]/is', array(&$this, 'recursiveReplaceSimple'), $text, 5);
*/

//ORIGINAL REPLACEMENT:

$replacement = $this->prepareReplacement($arr['replacement'], 0);

$text = preg_replace('/\['.$arr['tag'].'\](.+)\[\/'.$arr['tag'].'\]/isU', $replacement, $text);



}

else {
/*
$this->currentReplacement = $arr['replacement'];
$this->currentBBCode = $arr;

// RECURSIVE REPLACEMENT
$text = preg_replace_callback('/\['.$arr['tag'].'=([^\]]+)\](.+)\[\/'.$arr['tag'].'\]/is', array(&$this, 'recursiveReplaceAdvanced'), $text, 5);
*/

//ORIGINAL REPLACEMENT:

$replacement = $this->prepareReplacement($arr['replacement'], 1);

$text = preg_replace('/\['.$arr['tag'].'=([^\]]+)\](.+)\[\/'.$arr['tag'].'\]/isU', $replacement, $text);


}

}

return $text;

}

public function parseSmilies($text) {

$config = Penelope_Config::getInstance();
$website_url = $config->getProp("website_url");

foreach($this->smilies as $key => $arr)
{
$text = str_replace($arr['search'], '<img src="'.$website_url.'/imgs/smilies/'.$arr['filepath'].'" alt="'.$arr['name'].'" />', $text);
}

return $text;

}

public function addDefaultBBCode($text, $font, $color, $size) {

$tmp = "";

if($font) {
$tmp .= "";
}
if($color) {
$tmp .= "";
}
if($size) {
$tmp .= "";
}

$tmp .= $text;

if($size) {
$tmp .= "";
}
if($color) {
$tmp .= "";
}
if($font) {
$tmp .= "";
}

return $tmp;

}

}

?>

jackbenimble4
06-07-2008, 07:28 PM
After many many hours of fighting with this damn problem, I think I've finally reached a solution. I ended up creating a parser that converts the bbcode into a tree structure, and then converts the tree structure into html. I really like this solution, because if I ever had to parse into a different format instead of html, all I need is a new conversion table. For those who are curious, here is my new class.


<?php

class Penelope_TextParser {

private $supportedTags = array();
private $smilies = array();

private $text = "";
private $tree = array();


public function setInput($text) {

$this->text = htmlentities($text);
$this->resetTree();

return true;

}

public function resetTree() {
$this->tree = array("nodeid" => 0, "parentNodeId" => -1, "type" => "element", "tagname" => "body", "children" => array());
}

public function __construct() {

$this->resetTree();

$this->loadCodes();
$this->loadSmilies();

// Initialize built in bbcodes
/*
$this->addBBCode("b", 0, "<strong>{param}</strong>", 1);
$this->addBBCode("i", 0, "<em>{param}</em>", 1);
$this->addBBCode("url", 1, "<a href=\"{option}\">{param}</a>", 0);
$this->addBBCode("color", 1, "<span style=\"color: {option};\">{param}</span>", 1);
$this->addBBCode("img", 0, "<img src=\"{param}\" />", 0);*/


}

private function loadCodes() {

// this function loads all the bbcodes in the database into the object for use later when parsing is needed.

// get the database connection
$db = Penelope_DB::getInstance();

$getCodes = $db->prepare("SELECT * FROM bbcodes");
$getCodes->execute();
while($code = $getCodes->fetch(PDO::FETCH_ASSOC)) {
$this->addBBCode($code['tag'], $code['use_option'], $code['replacement'], $code['parse_content']);
}

if($getCodes->errorCode() == "0000") {
return true;
}
else {
return false;
}

}

private function loadSmilies() {

// this function loads all the smilies in the database into the object for use later when parsing

$db = Penelope_DB::getInstance();

$getSmilies = $db->prepare("SELECT * FROM smilies");
$getSmilies->execute();
$this->smilies = $getSmilies->fetchAll(PDO::FETCH_ASSOC);

if($getSmilies->errorCode() == "0000")
{
return true;
}
else {
return false;
}

}

public function parse($text = NULL) {

if($text != NULL) {
$this->setInput($text);
}

// make the bbcode tree
$this->constructTree();
$result = $this->parseTreeAsHTML();

$result = $this->parseSmilies($result);

$result = nl2br($result);

return $result;

}

public function parseSmilies($text) {

$config = Penelope_Config::getInstance();
$website_url = $config->getProp("website_url");

foreach($this->smilies as $key => $arr)
{
$text = str_replace($arr['search'], '<img src="'.$website_url.'/imgs/smilies/'.$arr['filepath'].'" alt="'.$arr['name'].'" />', $text);
}

return $text;

}

// Tree Parsing Functions

public function parseTreeAsHTML() {

$parsedText = "";
foreach($this->tree['children'] as $key => $arr) {
$parsedText .= $this->parseAsHTML($arr, 1);
}

return $parsedText;

}

private function parseAsHTML($el, $parse_content) {

$parsedText = "";

if($el['type'] == 'text') {
// regular text is easy, just print that baby!
$parsedText .= $el['data'];
}
else if($parse_content == 1) {

if($el['attribute']) {
// there's an attribute, so find the bbcode that matches both the tagname and the option
$replacement = $this->getReplacement($el['tagname'], 1);

if($replacement) {

// replace the {option} with the attribute we found
$use_option = 1;
$replacement = str_replace('{option}', $el['attribute'], $replacement);
}
}
else {
// find the bbcode with the right tagname and has no option
$replacement = $this->getReplacement($el['tagname'], 0);
$use_option = 0;
if($replacement === false) {
// there's no bbcode for this.
// Maybe they want to use the bbcode with an option, but shorthand

if(count($el['children']) == 1 AND $el['children'][0]['type'] == "text") {
// the element only has 1 child. Let's use the child as the option.
$replacement = $this->getReplacement($el['tagname'], 1);
if($replacement) {
$replacement = str_replace('{option}', $el['children'][0]['data'], $replacement);
$use_option = 1;
}
}
}
}

if($replacement) {
// get the two parts that are split by the {param}
$replacementParts = explode('{param}', $replacement);

}
else {
// okay, this tag isn't supported. Just show the raw tag.

if($el['attribute']) {
$replacementParts[0] = "[".$el['tagname']."=".$el['attribute']."]";
}
else {
$replacementParts[0] = "[".$el['tagname']."]";
}

$replacementParts[1] = "[/".$el['tagname']."]";

}

// print the first part,
$parsedText .= $replacementParts[0];


// print any children
if(count($el['children']) > 0) {
$parse_content = $this->getParseContent($el['tagname'], $use_option);

foreach($el['children'] as $key => $arr) {
$parsedText .= $this->parseAsHTML($arr, $parse_content);
}
}

if($replacementParts[1]) {
// now print the second part (usually a closing tag)
$parsedText .= $replacementParts[1];
}
}
else {
// Don't parse any elements...
if($el['attribute']) {
$replacementParts[0] = "[".$el['tagname']."=".$el['attribute']."]";
}
else {
$replacementParts[0] = "[".$el['tagname']."]";
}

$replacementParts[1] = "[/".$el['tagname']."]";

$parsedText .= $replacementParts[0];

// print any children
if(count($el['children']) > 0) {
foreach($el['children'] as $key => $arr) {
$parsedText .= $this->parseAsHTML($arr, $parse_content);
}
}

$parsedText .= $replacementParts[1];

}

return $parsedText;
}


private function getReplacement($tagname, $option = 0) {

foreach($this->supportedTags as $key => $arr) {
if($arr['tagname'] == $tagname && $arr['option'] == $option)
{

return $arr['replacement'];
}
}

return false;

}

private function getParseContent($tagname, $option = 0) {

foreach($this->supportedTags as $key => $arr) {
if($arr['tagname'] == $tagname && $arr['option'] == $option)
{

return $arr['parse_content'];
}
}

return false;

}

// Tree Manipulation

public function constructTree() {

$text = $this->text;

$data = preg_split('/([\[\]])/', $text, -1, PREG_SPLIT_DELIM_CAPTURE);

$inTag = false;
$endingTag = false;
$tagName = null;
$nodeid = 0;
$parentNodeId = 0;
$attribute = null;

foreach($data as $key => $piece) {

$nodeid++;

if($piece == '[') {
$inTag = true;
}
else if($piece == ']' && $inTag === true) {
$inTag = false;

if(!$endingTag) {
// add the element
$this->_treeAppendChild($parentNodeId, array(
"nodeid" => $nodeid,
"parentNodeId" => $parentNodeId,
"type" => "element",
"tagname" => $tagName,
"attribute" => $attribute,
"children" => array()));

$parentNodeId = $nodeid;
}
else {
// we're ending this element
$endingTag = false;

$parent = $this->_treeGetElementById($parentNodeId);
$parentNodeId = $parent['parentNodeId'];

}
}

else {
if($inTag) {
// $piece is a tag name
if(substr($piece, 0, 1) == '/') {
// ending tag

$tagName = null;
$endingTag = true;

}
else {
// beginning tag
if(stripos($piece, '=')) {
// there's an attribute
$parts = explode('=', $piece);
$tagName = $parts[0];
$attribute = $parts[1];
}
else {
$attribute = null;
$tagName = $piece;
}
}
}
else {
// $piece is regular text, add it to it's parent
$this->_treeAppendChild($parentNodeId, array(
"nodeid" => $nodeid,
"parentNodeId" => $parentNodeId,
"type" => "text",
"data" => $piece));

}
}


}
}

private function _treeAppendChild($parentId, $elInfo) {

$parent =& $this->_treeGetElementById($parentId);

if(isset($parent['children']) && is_array($parent['children'])) {
array_push($parent['children'], $elInfo);
return true;
}

else {
// nothing to add the element to
return false;
}

}

private function &_treeGetElementById($id) {

return $this->_treeGetElementByIdHelper($id, &$this->tree);

}

private function &_treeGetElementByIdHelper($id, &$arr) {
if($arr['nodeid'] == $id) {
// this is the array we're looking for!
// return a reference to the array!
return $arr;
}
else {
if(is_array($arr['children']) && count($arr['children']) > 0) {
// loop through the children
foreach($arr['children'] as $key => $child) {
$result = & $this->_treeGetElementByIdHelper($id, &$arr['children'][$key]);

if($result != false) {
// they found it
return $result;
}
}
}
}

return false;
}

// BBCodes List Manipulation

public function addBBCode($tagname, $use_option, $replacement, $parse_content) {

$this->supportedTags[] = array(
"tagname" => $tagname,
"option" => $use_option,
"replacement" => $replacement,
"parse_content" => $parse_content);

}

public function printSupportedBBCodes() {
echo "<p><strong>Supported BBCodes:</strong></p>";
echo "<pre>";
foreach($this->supportedTags as $key => $arr) {
$text = print_r($arr, true);
echo htmlentities($text);
}
echo "</pre>";

}


}

?>