您好,欢迎来到三六零分类信息网!本站为老站,搜索引擎当天收录,欢迎发布信息。

解析 HTML 标签,实现快速查找节点并获取节点信息

2025/12/21 2:04:04 发布,27 次查看
详细介绍和使用请点击源码出处。 _removenoise($str); if ($str === null) { self::$tagparseerror = true; } else { $l = strpos($str, ' if ($l !== false) { $this->plaintext = substr($str, 0, $l); } $res = preg_match_all('~>(.*?) if ($res !== false && $res > 0) { $this->plaintext .= implode($matches[1]); } $r = strrpos($str, '>'); if ($r !== false) { $this->plaintext .= substr($str, $r+1); } $tagcollect = array(); $attrcollect = array(); $innercontentcollect = array(); if ($this->parsetag($str, $tagcollect, $attrcollect, $innercontentcollect) === false) { self::$tagparseerror = true; } foreach ($tagcollect as $index => $tag) { $this->child[] = new tagdomnode($tag, $this, $attrcollect[$index], $innercontentcollect[$index], $this->level+1); } } } /** * parsetag * * @param mixed $str description. * @param mixed &$tagcollect description. * @param mixed &$attrcollect description. * @param mixed &$innercontentcollect description. * * @access protected * * @return boolean value. */ protected function parsetag($str, array &$tagcollect, array &$attrcollect, array &$innercontentcollect) { $selfclosingtags = array('img' => 1, 'br' => 1, 'input' => 1, 'meta' => 1, 'link' => 1, 'hr' => 1, 'base' => 1, 'embed' => 1, 'spacer' => 1); $end = -2; $close = 0; $error = false; $tag = ''; while (true) { $l = strpos($str, ' if ($l === false) {//parse end break; } if (strpos(substr($str, $l, 2), '/') !== false) {//surplus closing tag,discard $error = true; $end = $l+strlen($tag); self::$errortag[] = substr($str, $l, strpos($str, '>', $l)-$l+1); continue; } $r = strpos($str, '>', $l); $tag = substr($str, $l+1, $r-$l-1); if (!ctype_alpha($tag[0]) || strpos($tag, ' $end = $r + 1; continue; } $tag = preg_replace(~\n+~, ' ', $tag); $space = strpos($tag, ' '); if ($space !== false) { $attrcollect[] = substr($tag, $space+1); $tag = substr($tag, 0, $space); } else { $attrcollect[] = ''; } $tagcollect[] = $tag; if (isset($selfclosingtags[$tag])) { $innercontentcollect[] = ''; $end = $r-strlen($tag)-2; $close = $r+1; 
continue; } $countopen = -1; $open = strpos($str, ' $close = strpos($str, ''.$tag.'>', $open); if ($close === false) {//surplus opening tag $innercontentcollect[] = substr($str, $r+1); $error = true; self::$errortag[] = ''; break; } $start = $open; while ($open $countopen++; $open = strpos($str, ' } while ($countopen > 0 && $close !== false) { $open = strpos($str, ' $close = strpos($str, ''.$tag.'>', $close+strlen($tag)+3); if ($close === false) { break; } $countopen--; while ($open $open = strpos($str, ' $countopen++; } } if ($close === false) {//标签闭合不配对 $innercontentcollect[] = substr($str, $r+1); $error = true; break; } $end = $close; $r = strpos($str, '>', $start); $innercontentcollect[] = substr($str, $r+1, $end - $r - 1); } return !$error; } /** * _removenoise * * @param string &$str the tag string to be parse. * * @access private * * @return string */ private function _removenoise(&$str) { $str = preg_replace('~~is', '', $str); $str = preg_replace('~~is', '', $str); $str = preg_replace('~*?>~is', '', $str); } /** * parseselectors * * @param string $selectors user's select condition. * @param array &$selectorstag tags * @param array &$selectorsattr attributes * * @access protected * * @return null */ protected function parseselectors($selectors, array &$selectorstag, array &$selectorsattr) { preg_match_all('~([\w\d]+)(\[[\w\d -=._/]+\])?~', $selectors, $matches); $selectorstag = $matches[1]; foreach ($matches[2] as $key => $value) { $selectorsattr[$key] = array(); if ($value !== '') { preg_match_all('~([\w\d-]+)=([\w\d-. _/]+)~', $value, $matches); foreach ($matches[1] as $index => $attr) { $selectorsattr[$key][$attr] = $matches[2][$index]; } } } } /** * find * * @param mixed $selectors user's select condition. * @param array $selectorstag tags. * @param array $selectorsattr attributes. 
* * @access public * * @return array */ public function find($selectors, $selectorstag = array(), $selectorsattr = array()) { if ($selectors !== null) { $this->parseselectors($selectors, $selectorstag, $selectorsattr); } var_dump($selectorstag, $selectorsattr);exit(); if (!empty($selectorstag)) { $this->seek($selectorstag, $selectorsattr); foreach ($this->child as $key => $node) { $node->find(null, $selectorstag, $selectorsattr); } } if ($selectors !== null) { $res = self::$foundnode; self::$foundnode = array(); return $res; } } /** * findglobal * * @param string $selectors user's select condition. * * @access public * * @return array */ public function findglobal($selectors) { $space = strpos($selectors, ' ', strpos($selectors, ']')); if ($space === false) { return $this->findoneglobal($selectors); } else { $selectorsattr = array(); $selectorstag = array(); $this->findoneglobal(substr($selectors, 0, $space), false); $this->parseselectors(substr($selectors, $space + 1), $selectorstag, $selectorsattr); if (!empty(self::$foundnode) && !empty($selectorstag)) { $nodes = self::$foundnode; self::$foundnode = array(); foreach ($nodes as $key => $node) { $node->seek($selectorstag, $selectorsattr); } } } $res = self::$foundnode; self::$foundnode = array(); return $res; } /** * seek * * @param array $selectorstag tags. * @param array $selectorsattr attributes. * * @access protected * * @return null */ protected function seek($selectorstag, $selectorsattr) { foreach ($this->child as $key => $node) { $isfind = true; if ($node->tag === $selectorstag[0]) { foreach ($selectorsattr[0] as $attrname => $value) { if (isset($node->attr[$attrname]) && (preg_match('~.*? '.$value.' .*?~', $node->attr[$attrname]) > 0 || preg_match('~^'.$value.'$~', $node->attr[$attrname]) > 0 || preg_match('~^'.$value.' 
~', $node->attr[$attrname]) > 0 || preg_match('~ '.$value.'$~', $node->attr[$attrname]) > 0) ) { continue; } else { $isfind = false; break; } } } else { $isfind = false; } if ($isfind) { if (count($selectorstag) === 1) { self::$foundnode[] = $node; } else { $node->seek( array_slice($selectorstag, 1), array_slice($selectorsattr, 1) ); } } } } /** * findoneglobal * * @param string $selector user's select condition. * @param bool $isreturn weather return value. * * @access public * * @return array */ public function findoneglobal($selector, $isreturn = true) { preg_match('~([\w\d]+)(\[[\w\d -=._/]+\])?~', $selector, $matches); $tag = $matches[1]; $attr = array(); if (isset($matches[2])) { preg_match_all('~([\w\d-]+)=([\w\d-. _/]+)~', $matches[2], $matches); foreach ($matches[1] as $key => $value) { $attr[$value] = $matches[2][$key]; } } if (isset(self::$tagset[$tag])) { foreach (self::$tagset[$tag] as $attrvalue => $nodearray) { $isfind = true; foreach ($attr as $attrname => $value) { if (preg_match('~'.$attrname.'=.*? '.$value.' .*?~', $attrvalue) || preg_match('~'.$attrname.'='.$value.' .*?~', $attrvalue) || preg_match('~'.$attrname.'=.*? '.$value.'~', $attrvalue) || preg_match('~'.$attrname.'='.$value.'~', $attrvalue) ) { continue; } else { $isfind = false; break; } } if ($isfind) { foreach ($nodearray as $key => $node) { self::$foundnode[] = $node; } } } } if ($isreturn) { $res = self::$foundnode; self::$foundnode = array(); return $res; } }}/*** tagdomnode** @uses tagdomroot** @category tagparse* @package tagparse* @author kun * @copyright 2014 kun* @license http://www.php.net/license/3_01.txt php license 3.01* @version 1.0* @link http://www.blogkun.com* @since 1.0*/class tagdomnode extends tagdomroot{ public $attr = array(); public $parent = null; /** * __construct * * @param mixed $tag tag. * @param mixed $parent parent node. * @param mixed $attr attribute. * @param mixed $innercontent tag content. * @param mixed $level node level. 
* * @access public * * @return tagdomnode */ public function __construct($tag, $parent, $attr, $innercontent, $level) { $this->tag = $tag; $this->parent = $parent; $this->_parseattr($attr); $this->level = $level; $l = strpos($innercontent, ' if ($l !== false) { $this->plaintext = substr($innercontent, 0, $l); } $res = preg_match_all('~>(.*?) if ($res !== false && $res > 0) { $this->plaintext .= implode($matches[1]); } else { $this->plaintext .= $innercontent; } $r = strrpos($innercontent, '>'); if ($r !== false) { $this->plaintext .= substr($innercontent, $r+1); } $tagcollect = array(); $attrcollect = array(); $innercontentcollect = array(); if ($this->parsetag($innercontent, $tagcollect, $attrcollect, $innercontentcollect) === false) { self::$tagparseerror = true; } foreach ($tagcollect as $index => $tag) { $this->child[] = new tagdomnode($tag, $this, $attrcollect[$index], $innercontentcollect[$index], $this->level+1); } if (!isset(self::$tagset[$this->tag])) { self::$tagset[$this->tag] = array(); } if (!isset(self::$tagset[$this->tag][$attr])) { self::$tagset[$this->tag][$attr] = array(); } self::$tagset[$this->tag][$attr][] = &$this; } /** * _parseattr * * @param string $str attribute string. * * @access public * * @return null */ private function _parseattr($str) { preg_match_all('~(?[\w-]+)=(?.*?)~s', $str, $matches); foreach ($matches['attrname'] as $key => $value) { $this->attr[$value] = $matches['attrvalue'][$key]; } }}
复制代码
该用户其它信息

VIP推荐

免费发布信息,免费发布B2B信息网站平台 - 三六零分类信息网 沪ICP备09012988号-2
企业名录 Product