PHP抓取、分析国内视频网站的视频信息工具类

使用方法：
复制代码代码如下:
require_once 'videourlparser.class.php';
$url = 'http://v.youku.com/v_show/id_xmjkwmzc0njg4.html';
$info = videourlparser::parse($url);
print_r($info);
说明：调用该工具php文件videourlparser.class.php，$url变量的值为视频页的地址，然后使用print_r输出变量$info（$info是数组，直接用echo只会输出“Array”）。
附：info含有的几个值，分别是img（用于视频缩略图），title（视频标题），url（地址），swf（视频swf播放地址）。我只用到了img和swf地址。具体的可以根据自己的需要进行调整。
videourlparser类源码：
复制代码代码如下:
iid == $iid) {
                        break;
                    }
                }
            }
            $data['img'] = $val->pic;
            $data['title'] = $val->title;
            $data['url'] = $url;
            $data['swf'] = http://www.tudou.com/l/{$icode}/&iid={$iid}/v.swf;
            return $data;
        }
        $host = www.tudou.com;
        $path = /v/{$matches[1]}/v.swf;
        $ret = self::_fsget($path, $host);
        if (preg_match(#\nlocation: (.*)\n#, $ret, $mat)) {
            parse_str(parse_url(urldecode($mat[1]), php_url_query));
            $data['img'] = $snap_pic;
            $data['title'] = $title;
            $data['url'] = $url;
            $data['swf'] = http://www.tudou.com/v/{$matches[1]}/v.swf;
            return $data;
        }
        return false;
    }
    /**
     * Ku6 (ku6.com) parser.
     *
     * Page URL shapes handled:
     *   http://v.ku6.com/film/show_520/3x93vo4tis7uothg.html
     *   http://v.ku6.com/special/show_4926690/klze2mhmesk6g05x.html
     *   http://v.ku6.com/show/7us-kdxjykyiindevhpwhg...html
     * Player URL shape produced for "show_" pages:
     *   http://player.ku6.com/refer/3x93vo4tis7uothg/v.swf
     *
     * Declared static because every caller in this class invokes it through
     * self::, which is an error for non-static methods on PHP 8.
     *
     * @param string $url Video page URL.
     * @return array|false Array with the keys img, title, url and swf, or
     *                     false when the URL is not recognised or the remote
     *                     data could not be fetched or parsed.
     */
    private static function _parseku6($url){
        if (preg_match('/show\_/', $url)) {
            // "show_" pages: the last path segment is the video id and the
            // metadata is read from a JSON endpoint keyed by that id.
            if (!preg_match('#/([-\w]+)\.html#', $url, $matches)) {
                return false;
            }
            $vid = $matches[1];
            // NOTE(review): the text this file was recovered from appears to
            // have been lower-cased; confirm the letter case of this path.
            $api = "http://v.ku6.com/fetchvideo4player/{$vid}.html";
            $html = self::_fget($api);
            if (!$html) {
                return false;
            }
            $json = json_decode($html, true);
            if (!$json || !isset($json['data'])) {
                return false;
            }
            $info = $json['data'];
            $data = array();
            $data['img'] = isset($info['picpath']) ? $info['picpath'] : null;
            $data['title'] = isset($info['t']) ? $info['t'] : null;
            // Report the page URL the caller passed in, not the API URL.
            $data['url'] = $url;
            $data['swf'] = "http://player.ku6.com/refer/{$vid}/v.swf";
            return $data;
        }

        if (preg_match('/show\//', $url)) {
            // "show/" pages: everything is scraped out of the page HTML.
            $html = self::_fget($url);
            if (!$html || !preg_match('/objectinfo\s?=\s?([^\n]*)};/si', $html, $matches)) {
                return false;
            }
            $str = $matches[1];

            // img
            $img = preg_match('/cover\s?:\s?"([^"]+)"/', $str, $matches) ? $matches[1] : null;

            // title: wrapped in a JSON document and decoded, presumably so
            // that escape sequences inside the scraped value are resolved.
            $title = null;
            if (preg_match('/title"?\s?:\s?"([^"]+)"/', $str, $matches)) {
                $json = json_decode('{"title":"' . $matches[1] . '"}', true);
                $title = isset($json['title']) ? $json['title'] : null;
            }

            // query string handed to the flash player; the "i" flag keeps the
            // match independent of the attribute's letter case.
            if (!preg_match('/"(vid=[^"]+)"\sname="flashvars"/si', $html, $matches)) {
                return false;
            }
            $query = str_replace('&amp;', '&', $matches[1]);

            // protocol-relative player address
            if (!preg_match('/\/\/player\.ku6cdn\.com[^"\']+/', $html, $matches)) {
                return false;
            }

            $data = array();
            $data['img'] = $img;
            $data['title'] = $title;
            $data['url'] = $url;
            $data['swf'] = 'http:' . $matches[0] . '?' . $query;
            return $data;
        }

        // Neither URL shape matched.
        return false;
    }
    /**
     * 56.com parser.
     *
     * Page URL shape:   http://www.56.com/u73/v_ntkzmdcwndy.html
     * Player URL shape: http://player.56.com/v_ntkzmdcwndy.swf
     *
     * Declared static because it is invoked through self:: inside this class.
     *
     * @param string $url Video page URL.
     * @return array|false Array with the keys img, title, url and swf, or
     *                     false when the id cannot be extracted or the JSON
     *                     metadata cannot be fetched or decoded.
     */
    private static function _parse56($url){
        // The id is the part between "/v_" and ".html".
        if (!preg_match('#/v_(\w+)\.html#', $url, $matches)) {
            return false;
        }
        $id = $matches[1];

        $link = "http://vxml.56.com/json/{$id}/?src=out";
        $retval = self::_cget($link);
        if (!$retval) {
            return false;
        }

        $json = json_decode($retval, true);
        // Guard against a non-JSON reply before indexing into it.
        if (!is_array($json) || !isset($json['info']) || !is_array($json['info'])) {
            return false;
        }
        $info = $json['info'];

        $data = array();
        $data['img'] = isset($info['img']) ? $info['img'] : null;
        $data['title'] = isset($info['subject']) ? $info['subject'] : null;
        $data['url'] = $url;
        $data['swf'] = "http://player.56.com/v_{$id}.swf";
        return $data;
    }
    /**
     * LeTV (letv.com) parser.
     *
     * Page URL shape:   http://www.letv.com/ptv/vplay/1168109.html
     * Player URL shape: http://www.letv.com/player/x1168109.swf
     *
     * The thumbnail and title are read from the query string of a
     * v.t.sina.com.cn link found in the page HTML ("pic" and "title").
     *
     * Declared static because it is invoked through self:: inside this class.
     *
     * @param string $url Video page URL.
     * @return array|false Array with the keys img, title, url and swf (img
     *                     and title are null when the link is missing), or
     *                     false when the page cannot be fetched or the URL
     *                     carries no numeric "vplay/<id>" part.
     */
    private static function _parseletv($url){
        if (!preg_match('#vplay/(\d+)#', $url, $idMatch)) {
            return false;
        }

        $html = self::_fget($url);
        if (!$html) {
            return false;
        }

        // parse_str() must be given a result array: the one-argument form
        // that injected variables into the local scope was removed in PHP 8.
        $params = array();
        if (preg_match('#http://v.t.sina.com.cn/([^\'"]*)#', $html, $share)) {
            $query = parse_url(urldecode($share[0]), PHP_URL_QUERY);
            if (is_string($query)) {
                parse_str($query, $params);
            }
        }

        $data = array();
        $data['img'] = isset($params['pic']) ? $params['pic'] : null;
        $data['title'] = isset($params['title']) ? $params['title'] : null;
        $data['url'] = $url;
        $data['swf'] = "http://www.letv.com/player/x{$idMatch[1]}.swf";
        return $data;
    }
    /**
     * Sohu TV parser.
     *
     * Page URL shape: http://my.tv.sohu.com/u/vw/5101536
     *
     * Reads the og:title, og:image and og:videosrc meta tags from the page.
     * Tags are mapped by name, so the result does not depend on the order in
     * which they appear in the HTML.
     *
     * Declared static because it is invoked through self:: inside this class.
     *
     * @param string $url Video page URL.
     * @return array|false Array with the keys img, title, url and swf (a
     *                     value is null when its tag is absent), or false
     *                     when the page cannot be fetched or converted, or
     *                     contains none of the tags.
     */
    private static function _parsesohu($url){
        $html = self::_fget($url);
        if (!$html) {
            return false;
        }

        // The page is converted from GB2312 to UTF-8 before matching.
        $html = iconv('gb2312', 'utf-8', $html);
        if ($html === false) {
            return false;
        }

        $pattern = '/og:(title|image|videosrc)"\scontent="([^"]+)"/si';
        if (!preg_match_all($pattern, $html, $tags, PREG_SET_ORDER)) {
            return false;
        }

        // Keep the first occurrence of each tag, keyed by lower-cased name.
        $meta = array();
        foreach ($tags as $tag) {
            $name = strtolower($tag[1]);
            if (!isset($meta[$name])) {
                $meta[$name] = $tag[2];
            }
        }

        $data = array();
        $data['img'] = isset($meta['image']) ? $meta['image'] : null;
        $data['title'] = isset($meta['title']) ? $meta['title'] : null;
        $data['url'] = $url;
        $data['swf'] = isset($meta['videosrc']) ? $meta['videosrc'] : null;
        return $data;
    }
/**
     * Sina video (video.sina.com.cn) parser.
     *
     * Page URL shape:
     *   http://video.sina.com.cn/v/b/48717043-1290055681.html
     * Player URL shape:
     *   http://you.video.sina.com.cn/api/sinawebapi/outplayrefer.php/vid=48717043_1290055681_puzksndrdzxk+l1lhz2stqkp7kqnt6nki2o0u1ehiwzyq0/xm5gdatog5ynsa9keqdhaqja4dpkm0x4/s.swf
     *
     * The two numeric ids are taken from the URL, the canonical page is
     * fetched, and the "video: {...}" JavaScript object literal found in it
     * is rewritten into JSON and decoded.
     *
     * Declared static because it is invoked through self:: inside this class.
     *
     * @param string $url Video page or player URL containing "<id>-<id>" or
     *                    "<id>_<id>".
     * @return array|false Array with the keys img, title, url and swf, or
     *                     false when the ids, the page or the embedded
     *                     object cannot be obtained.
     */
    private static function _parsesina($url){
        if (!preg_match('/(\d+)(?:\-|\_)(\d+)/', $url, $matches)) {
            return false;
        }
        // Normalise to the canonical page address; this is also the value
        // reported back under the "url" key.
        $url = "http://video.sina.com.cn/v/b/{$matches[1]}-{$matches[2]}.html";

        $html = self::_fget($url);
        if (!$html) {
            return false;
        }

        // NOTE(review): the end of this pattern was missing from the text
        // this file was recovered from; "([^<]+)}" is a reconstruction and
        // should be confirmed against a real page.
        if (!preg_match('/video\s?:\s?([^<]+)}/', $html, $matches)) {
            return false;
        }

        // Rewrite the JavaScript object literal into JSON: drop newlines and
        // whitespace, turn single quotes into double quotes, quote the keys
        // and blank out bare (non-numeric, unquoted) values.
        $find = array('/\n/', '/\s*/', "/'/", '/\{([^:,]+):/', '/,([^:]+):/', '/:[^\d"]\w+[^\,]*,/i');
        $replace = array('', '', '"', '{"\\1":', ',"\\1":', ':"",');
        $str = preg_replace($find, $replace, $matches[1]);

        $arr = json_decode($str, true);
        if (!is_array($arr)) {
            return false;
        }
        // Look keys up case-insensitively so the result does not depend on
        // the exact letter case used by the page.
        $arr = array_change_key_case($arr, CASE_LOWER);

        $data = array();
        $data['img'] = isset($arr['pic']) ? $arr['pic'] : null;
        $data['title'] = isset($arr['title']) ? $arr['title'] : null;
        $data['url'] = $url;
        $data['swf'] = isset($arr['swfoutsideurl']) ? $arr['swfoutsideurl'] : null;
        return $data;
    }
    /**
     * Fetch a URL with file_get_contents().
     *
     * Declared static because it is invoked through self:: inside this class.
     *
     * @param string $url Address to fetch.
     * @return string|false The response body, gzip-decoded when
     *                      self::_gzdecode() returns a non-empty result;
     *                      false when $url is empty or the fetch fails.
     */
    private static function _fget($url=''){
        if (!$url) {
            return false;
        }

        $html = file_get_contents($url);
        if ($html === false) {
            // Do not pass a failed fetch on to the gzip decoder.
            return false;
        }

        // Use the decoded payload only when the gzip check succeeds.
        $dehtml = self::_gzdecode($html);
        return $dehtml ? $dehtml : $html;
    }
    /**
     * Fetch a path from a host over a raw socket (fsockopen, port 80).
     *
     * The complete response is returned, status line and headers included.
     *
     * Declared static because it is invoked through self:: inside this class.
     *
     * @param string $path       Request path, e.g. "/v/abc/v.swf".
     * @param string $host       Host name to connect to.
     * @param string $user_agent Optional User-Agent; the class constant is
     *                           used when empty.
     * @return string|false Raw response text, or false when an argument is
     *                      empty, the connection fails or the request cannot
     *                      be written.
     */
    private static function _fsget($path='/', $host='', $user_agent=''){
        if (!$path || !$host) {
            return false;
        }
        $user_agent = $user_agent ? $user_agent : self::user_agent;

        // Header lines are joined with CRLF. "Connection: close" makes the
        // server end the stream after the response, so the read loop below
        // stops at EOF instead of waiting on a kept-alive connection.
        $out = implode("\r\n", array(
            "GET {$path} HTTP/1.1",
            "Host: {$host}",
            "User-Agent: {$user_agent}",
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language: zh-cn,zh;q=0.5',
            'Accept-Charset: gb2312,utf-8;q=0.7,*;q=0.7',
            'Connection: close',
        )) . "\r\n\r\n";

        $fp = @fsockopen($host, 80, $errno, $errstr, 10);
        if (!$fp) {
            return false;
        }
        if (!fputs($fp, $out)) {
            // Release the socket on the failure path as well.
            fclose($fp);
            return false;
        }

        $html = '';
        while (!feof($fp)) {
            $line = fgets($fp, 1024);
            if ($line === false) {
                break;
            }
            $html .= $line;
        }
        fclose($fp);

        // Use the decoded payload only when the gzip check succeeds.
        $dehtml = self::_gzdecode($html);
        return $dehtml ? $dehtml : $html;
    }
    /**
     * Fetch a URL with cURL.
     *
     * Declared static because it is invoked through self:: inside this class.
     *
     * @param string $url        Address to fetch.
     * @param string $user_agent Optional User-Agent; the class constant is
     *                           used when empty.
     * @return string|false The response body, gzip-decoded when
     *                      self::_gzdecode() returns a non-empty result;
     *                      false when $url is empty, cURL reports an error
     *                      or the body is empty.
     */
    private static function _cget($url='', $user_agent=''){
        if (!$url) {
            return false;
        }
        $user_agent = $user_agent ? $user_agent : self::user_agent;

        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        // Have curl_exec() return the body instead of printing it, which
        // removes the need for output buffering around the call.
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        if (strlen($user_agent)) {
            curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
        }

        $html = curl_exec($ch);
        $failed = curl_errno($ch) !== 0;
        curl_close($ch);

        if ($failed || !is_string($html) || !strlen($html)) {
            return false;
        }

        // Use the decoded payload only when the gzip check succeeds.
        $dehtml = self::_gzdecode($html);
        return $dehtml ? $dehtml : $html;
    }
/**
     * Decode a gzip (RFC 1952) byte string.
     *
     * Header layout relied on below: 2 magic bytes, 1 method byte, 1 flags
     * byte, 4 bytes mtime, 1 byte xfl, 1 byte os (10 bytes in total),
     * followed by the optional extra / file name / comment / header-CRC
     * fields selected by the flag bits, then the deflate body and an 8-byte
     * footer (CRC32 and uncompressed size, both little-endian).
     *
     * Declared static because it is invoked through self:: inside this class.
     *
     * @param string $data Candidate gzip data.
     * @return string|false|null The decoded data; null when $data is not in
     *                           gzip format (too short, wrong magic bytes,
     *                           reserved flag bits set, or no body); false
     *                           when the container is malformed, uses an
     *                           unsupported method or fails its length/CRC
     *                           check.
     */
    private static function _gzdecode($data) {
        $len = strlen($data);
        // 10 header bytes + 8 footer bytes is the smallest possible member.
        if ($len < 18 || strcmp(substr($data, 0, 2), "\x1f\x8b")) {
            return null; // not gzip format (see rfc 1952)
        }
        $method = ord(substr($data, 2, 1)); // compression method
        $flags = ord(substr($data, 3, 1)); // flags
        // Parenthesised on purpose: "!=" binds tighter than "&".
        if (($flags & 31) != $flags) {
            // reserved bits are set -- not allowed by rfc 1952
            return null;
        }

        $headerlen = 10;

        if ($flags & 4) {
            // 2-byte length prefixed extra data in header
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid format
            }
            $extralen = unpack('v', substr($data, $headerlen, 2));
            $extralen = $extralen[1];
            if ($len - $headerlen - 2 - $extralen < 8) {
                return false; // invalid format
            }
            $headerlen += 2 + $extralen;
        }

        if ($flags & 8) {
            // c-style string file name data in header
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid format
            }
            // Search from the current header offset, not a fixed position.
            $end = strpos($data, chr(0), $headerlen);
            if ($end === false || $len - $end - 1 < 8) {
                return false; // invalid format
            }
            $headerlen = $end + 1;
        }

        if ($flags & 16) {
            // c-style string comment data in header
            if ($len - $headerlen - 1 < 8) {
                return false; // invalid format
            }
            $end = strpos($data, chr(0), $headerlen);
            if ($end === false || $len - $end - 1 < 8) {
                return false; // invalid header format
            }
            $headerlen = $end + 1;
        }

        if ($flags & 1) {
            // 2-bytes (lowest order) of crc32 on header present
            if ($len - $headerlen - 2 < 8) {
                return false; // invalid format
            }
            $calccrc = crc32(substr($data, 0, $headerlen)) & 0xffff;
            $headercrc = unpack('v', substr($data, $headerlen, 2));
            $headercrc = $headercrc[1];
            if ($headercrc != $calccrc) {
                return false; // bad header crc
            }
            $headerlen += 2;
        }

        // gzip footer: both fields are 32-bit little-endian, hence "V".
        $datacrc = unpack('V', substr($data, -8, 4));
        $datacrc = $datacrc[1];
        $isize = unpack('V', substr($data, -4));
        $isize = $isize[1];

        // perform the decompression:
        $bodylen = $len - $headerlen - 8;
        if ($bodylen < 1) {
            // this should never happen - implementation bug!
            return null;
        }
        $body = substr($data, $headerlen, $bodylen);

        if ($method != 8) {
            // deflate (8) is the only supported compression method
            return false;
        }
        $decoded = gzinflate($body);
        if ($decoded === false) {
            return false;
        }

        // Compare as unsigned decimal strings so the check gives the same
        // answer whether or not the platform integer can hold 32 unsigned bits.
        $sizeOk = sprintf('%u', $isize) === sprintf('%u', strlen($decoded));
        $crcOk = sprintf('%u', $datacrc) === sprintf('%u', crc32($decoded));
        if (!$sizeOk || !$crcOk) {
            // bad format! length or crc doesn't match!
            return false;
        }
        return $decoded;
    }
}
http://www.bkjia.com/phpjc/748677.htmlwww.bkjia.comtruehttp://www.bkjia.com/phpjc/748677.htmltecharticle使用方法：复制代码代码如下: require_once videourlparser.class.php; $url = http://v.youku.com/v_show/id_xmjkwmzc0njg4.html; $info = vediourlparser::parse($url); e...

PHP抓取、分析国内视频网站的视频信息工具类_PHP教程

VIP推荐