最近 @甜瓜 (QQ:1069629945) 开发了一套NBA数据采集脚本, 我觉得很赞. 经他允许发布出来和大家分享一些经验:

球员球队: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30

在1到30的循环中抓取球队信息, 球员信息并用id将其关联起来, 脚本如下:

/// <summary>
/// Visits each NBA team page on data.sports.sohu.com, stores one "TEAM" record per
/// team and one "player" record per roster table row, linked through teamid.
/// </summary>
public void Run()
{
    // The published listing lost the numeric loop bounds; 1 and 30 come from the
    // accompanying text, which says the team pages are teamid=1 .. 30.
    const int FirstTeamId = 1;
    const int LastTeamId = 30;

    // Each team fact is read from its own <li> under this list.
    const string InfoItem = "div.blockB>div.left>div.pt>ul>li";

    Logger.ClearAll();
    for (int i = FirstTeamId; i <= LastTeamId; i++)
    {
        Default.Navigate("http://data.sports.sohu.com/nba/nba_team_info.php?teamid=" + i);
        Default.Ready();

        // Read every field once, strip its label, then reuse the cleaned value
        // for both logging and storage.
        var teamid = i.ToString();
        var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
        var teamurl = Default.SelectSingleNode(InfoItem + ">a").Text();
        var teamcity = Default.SelectSingleNode(InfoItem + ":eq(1)").Text().Replace("主场所在城市:", "");
        var gym = Default.SelectSingleNode(InfoItem + ":eq(2)").Text().Replace("主体育馆:", "");
        var peoplenum = Default.SelectSingleNode(InfoItem + ":eq(3)").Text().Replace("可容纳人数:", "");
        var intonba = Default.SelectSingleNode(InfoItem + ":eq(4)").Text().Replace("加入NBA时间:", "");
        var champion = Default.SelectSingleNode(InfoItem + ":eq(5)").Text().Replace("获总冠军次数:", "");
        var coach = Default.SelectSingleNode(InfoItem + ":eq(6)").Text().Replace("现任主教练:", "");

        Logger.Log(teamname);
        Logger.Log(teamurl);
        Logger.Log(teamcity);
        Logger.Log(gym);
        Logger.Log(peoplenum);
        Logger.Log(intonba);
        Logger.Log(champion);
        Logger.Log(coach);

        // teamcity was scraped and logged but never stored in the original, and the
        // city label was stripped from teamurl instead; both are corrected here.
        DataManager.AppendData("TEAM",
            DataEntry.Create()
                .Set("teamid", teamid)
                .Set("teamname", teamname)
                .Set("teamurl", teamurl)
                .Set("teamcity", teamcity)
                .Set("gym", gym)
                .Set("peoplenum", peoplenum)
                .Set("intonba", intonba)
                .Set("champion", champion)
                .Set("coach", coach)
        );
        Logger.Log(teamid);

        // NOTE(review): every <tr> of the roster table is processed; rows without a
        // player link (e.g. a header row) are not filtered out - confirm against the page.
        var rosterRows = Default.SelectNodes("div.tab>table tr");
        foreach (var player in rosterRows)
        {
            var num = player.SelectSingleNode("TD:eq(0)");
            var profileLink = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");
            var photo = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
            var playername = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A");
            var position = player.SelectSingleNode("TD:eq(2)>SPAN:eq(0)");
            var height = player.SelectSingleNode("TD:eq(3)");
            var weight = player.SelectSingleNode("TD:eq(4)");
            var birth = player.SelectSingleNode("TD:eq(5)");
            var college = player.SelectSingleNode("TD:eq(6)");

            // The player id is the first run of digits in the profile link's href.
            var playerid = Regex.Match(profileLink.Attr("href"), @"\d+").Value;
            var playerimageurl = photo.Attr("src");

            Logger.Log(photo.Text());
            Logger.Log(playername.Text());
            Logger.Log(position.Text());
            Logger.Log(height.Text());
            Logger.Log(weight.Text());
            Logger.Log(birth.Text());
            Logger.Log(college.Text());
            Logger.Log(playerimageurl);
            Logger.Log(playerid);

            DataManager.AppendData("player",
                DataEntry.Create()
                    .Set("playerid", playerid)
                    .Set("teamid", teamid)
                    .Set("playername", playername.Text())
                    .Set("position", position.Text())
                    .Set("height", height.Text())
                    .Set("weight", weight.Text())
                    .Set("birth", birth.Text())
                    .Set("college", college.Text())
                    .Set("num", num.Text())
                    .Set("playerimageurl", playerimageurl)
            );
        }
    }
}

比赛信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012

脚本如下:

/// <summary>
/// Opens the monthly schedule page, follows every "技术统计" link on it and appends
/// two "GAMESTATIC" records per game page: one built from the away-side values,
/// then one built from the home-side values.
/// </summary>
public void Run()
{
    const string ScheduleUrl = "http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012";

    // The team id is the digit run in front of ".jpg" in the logo image URL. The
    // published pattern read "d+" (literal letter d), which can never capture digits.
    const string TeamIdPattern = @"(?<id>\d+)\.jpg";

    const int QuartersPerGame = 4;
    const int MaxOvertimePeriods = 4;

    // NOTE(review): the published listing lost every numeric index ("logos[]",
    // "scores[]", "scoreslist[]"). The positions below are reconstructed on the
    // assumption that the page lists the away team first and the home team second,
    // with no header entries in the matched node lists - confirm against a live page.
    const int AwaySide = 0;
    const int HomeSide = 1;
    const int AwayQuarterOffset = 0;
    const int HomeQuarterOffset = QuartersPerGame;

    Logger.ClearAll();
    Default.Navigate(ScheduleUrl);
    Default.Ready();

    // Collect all box-score URLs first, because navigating away invalidates the
    // schedule page the links were read from.
    var links = Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技术统计\")");
    List<string> urls = new List<string>();
    var baseUri = new Uri(ScheduleUrl);
    foreach (var link in links)
    {
        // The original resolved links against a different month's schedule URL; the
        // script path is the same, so only an empty or fragment-only href would differ.
        urls.Add(new Uri(baseUri, link.Attr("href")).ToString());
    }

    foreach (var url in urls)
    {
        if (Default.Available == false) return;

        Default.Navigate(url);
        Default.Ready();

        var logos = Default.SelectNodes("td.logo img");
        var scores = Default.SelectNodes("table.tab04 tr");
        var scoreslist = Default.SelectNodes("table.tab02 tr>td");

        // Skip pages that do not have the expected layout instead of failing on an index.
        if (logos.Count <= HomeSide
            || scores.Count <= HomeSide
            || scoreslist.Count < HomeQuarterOffset + QuartersPerGame)
        {
            Logger.Log("Skipped (unexpected page layout): " + url);
            continue;
        }

        var gametime = Default.SelectSingleNode("div.center>h2").Text().Replace("开始比赛", "");

        // Overtime scores stay empty unless the page shows an overtime table.
        var awayOvertime = new string[] { "", "", "", "" };
        var homeOvertime = new string[] { "", "", "", "" };
        var overtimeMarker = Default.SelectSingleNode("table.tabBig td:contains(\"加时赛\")");
        if (!overtimeMarker.IsEmpty())
        {
            // NOTE(review): the original compared this <th> count against four literals
            // that were lost; they are assumed to be 1, 2, 3 and 4 (one <th> per overtime
            // period). Any other count leaves the overtime fields empty, as before.
            int periods = Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count;
            if (periods >= 1 && periods <= MaxOvertimePeriods)
            {
                for (int k = 0; k < periods; k++)
                {
                    // Row 1 of table.tab03 is read as the away side, row 2 as the home side.
                    awayOvertime[k] = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(" + k + ")").Text();
                    homeOvertime[k] = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(" + k + ")").Text();
                }
            }
        }

        // Emit the away record first, then the home record, with identical field layout.
        var sideIndexes = new int[] { AwaySide, HomeSide };
        var quarterOffsets = new int[] { AwayQuarterOffset, HomeQuarterOffset };
        var overtimeBySide = new string[][] { awayOvertime, homeOvertime };
        for (int s = 0; s < sideIndexes.Length; s++)
        {
            int side = sideIndexes[s];
            int q = quarterOffsets[s];
            string[] overtime = overtimeBySide[s];

            DataManager.AppendData("GAMESTATIC",
                DataEntry.Create()
                    .Set("teamid", Regex.Match(logos[side].Attr("src"), TeamIdPattern).Groups["id"].Value)
                    .Set("gametime", gametime)
                    .Set("score1", scoreslist[q].Text())
                    .Set("score2", scoreslist[q + 1].Text())
                    .Set("score3", scoreslist[q + 2].Text())
                    .Set("score4", scoreslist[q + 3].Text())
                    .Set("score", scores[side].Text())
                    .Set("gameid", url)
                    .Set("status", "")
                    .Set("jiashiscore1", overtime[0])
                    .Set("jiashiscore2", overtime[1])
                    .Set("jiashiscore3", overtime[2])
                    .Set("jiashiscore4", overtime[3])
            );
        }
    }
}

这里的亮点在于对加时赛的处理, 即脚本中查找包含"加时赛"的单元格并用 IsEmpty() 判断的那两行 (原脚本的第48, 49行). 不是所有的比赛都有加时赛, 就算有也可能打多个加时 (1-4个). 因此甜瓜非常细心地对这块也做了处理. 个人感觉这块代码还有优化的余地, 但是这种处理简单直白, 一目了然, 也是很不错的.

最后运行起来:

文中开发工具Spider Studio (采集工作站)下载地址: http://www.gdtsearch.com/products.spiderstudio.htm. 安装后运行, 将脚本复制进去点"运行"即可看到效果.

Spider Studio QQ群: 45995410

示例 - C#脚本代码采集搜狐NBA球员, 球队和比赛实况的更多相关文章

  1. 使用CURL和火车头软件采集搜狐文章

    直接上代码: //参数1:访问的URL,参数2:post数据(不填则为GET),参数3:提交的$cookies,参数4:是否返回$cookies function curl_request($url, ...

  2. C# 脚本代码自动登录淘宝获取用户信息

    C# 脚本代码自动登录淘宝获取用户信息   最近遇到的一个需求是如何让程序自动登录淘宝, 获取用户名称等信息. 其实这个利用SS (SpiderStudio的简称) 实现起来非常简单. 十数行代码就可 ...

  3. crawler4j源码学习(1):搜狐新闻网新闻标题采集爬虫

    crawler4j是用Java实现的开源网络爬虫.提供了简单易用的接口,可以在几分钟内创建一个多线程网络爬虫.下面实例结合jsoup,采集搜狐新闻网(http://news.sohu.com/)新闻标 ...

  4. 利用朴素贝叶斯分类算法对搜狐新闻进行分类(python)

    数据来源  https://www.sogou.com/labs/resource/cs.php介绍:来自搜狐新闻2012年6月—7月期间国内,国际,体育,社会,娱乐等18个频道的新闻数据,提供URL ...

  5. jquery仿搜狐投票动画代码

    体验效果:http://hovertree.com/texiao/jquery/21/ 这是一款基于jquery实现的仿搜狐投票动画特效源码,运行该源码可见VS图标首先出现在中间位置,紧接着随着投票比 ...

  6. 【HTML&CSS】搜狐页面代码编写

    <!DOCTYPE html> <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"& ...

  7. 搜狐云景paas平台实践之路

    前言: 搜狐云景作为搜狐的paas平台,在2014年5月22日的云计算大会上正式发布了公测.初测,注册用户必须先申请邀请码参与公测会赠送用户100元电子券,经过实名认证之后会再赠送100电子券,目测可 ...

  8. 山寨Unity3D?搜狐畅游的免费开源游戏引擎Genesis-3D

    在CSDN上看到了<搜狐畅游发布3D游戏引擎Genesis-3D 基于MIT协议开源>(http://www.csdn.net/article/2013-11-21/2817585-cha ...

  9. 利用jieba,word2vec,LR进行搜狐新闻文本分类

    一.简介 1)jieba 中文叫做结巴,是一款中文分词工具,https://github.com/fxsjy/jieba 2)word2vec 单词向量化工具,https://radimrehurek ...

随机推荐

  1. 如何使用JW Player来播放Flash并隐藏控制按钮和自定义播放完成后执行的JS

    在一个客户项目中播放的flash需要进行定制如不显示控制按钮,flash播放完成后执行特定的js等,在用过了N多的JQery插件和播放器后最终JW Player插件可以满足我的以上要求 因为JW Pl ...

  2. Nubia Z5S官方4.4 UI2.0音频Audio部分简单分析(也适用于其它8974/8064机型)以及降低破音出现几率的方法

    转载请注明出处和网址链接: http://blog.csdn.net/syhost/article/details/31419749 此篇本是在Z5S的官方4.4内測版出来时写的, 主要是看到其在au ...

  3. JS及JQuery对Html内容编码,Html转义

    1利用jquery /** JQuery Html Encoding.Decoding * 原理是利用JQuery自带的html()和text()函数可以转义Html字符 * 虚拟一个Div通过赋值和 ...

  4. cocos2d-x 音乐与音效

    1.背景音乐 要使用一个音乐,首先要预加载这个音乐,预加载的方法如下 SimpleAudioEngine::sharedEngine()->preloadBackgroundMusic( CCF ...

  5. android java 设计模式详解 Demo

    android java 设计模式详解 最近看了一篇设计模式的文章,深得体会,在此基础我将每种设计模式的案例都写成Demo的形式,方便读者研究学习, 首先先将文章分享给大家: 设计模式(Design ...

  6. QtGui.QComboBox

    The QtGui.QComboBox is a widget that allows a user to choose from a list of options. #!/usr/bin/pyth ...

  7. ajax local.href不跳转的原因之一

    ajax local.href不跳转的原因之一 打开F12发现一直报 next.html is not a function…… 后来发现next少了(),看得我尴尬症都犯了

  8. 请MVC5 WebApi2 支持OData协议查询

    一.配置项 1.WebApiConfig.cs添加如下代码: // api 支持 cors允许Ajax发起跨域的请求(nuget 中搜索 ASP.NET Cross-Origin Support,然后 ...

  9. windows bat文件运行中文乱码

      windows bat文件运行中文乱码 CreationTime--2018年7月17日08点51分 Author:Marydon 1.情景展示 运行bat文件,里面的中文提示显示乱码 2.问题剖 ...

  10. 〖Android〗/system/etc/recovery-resource.dat

    源代码中的解释:[platform_build/tools/releasetools/ota_from_target_files] # Recovery is generated as a patch ...