最近 @甜瓜 (QQ:1069629945) 开发了一套NBA数据采集脚本, 我觉得很赞. 经他允许发布出来和大家分享一些经验:

球员球队: http://data.sports.sohu.com/nba/nba_team_info.php?teamid=1 .. 30

在1到30的循环中抓取球队信息, 球员信息并用id将其关联起来, 脚本如下:

/// <summary>
/// Scrapes every Sohu NBA team page (teamid 1 through 30) and stores one "TEAM" row per
/// team plus one "player" row per roster entry. Each player row carries the teamid of the
/// page it was found on, so the two data sets can be joined on that id.
/// </summary>
/// <remarks>
/// Uses the script host objects Default (browser), Logger, DataManager and DataEntry.
/// </remarks>
public void Run()
{
    // The team pages are addressed by consecutive ids; the article gives the range as 1..30.
    const int firstTeamId = 1;
    const int lastTeamId = 30;
    const string teamPageUrl = "http://data.sports.sohu.com/nba/nba_team_info.php?teamid=";
    // Every team fact lives in one <li> of this list; the position selects the fact.
    const string infoItem = "div.blockB>div.left>div.pt>ul>li";

    Logger.ClearAll();
    for (int i = firstTeamId; i <= lastTeamId; i++)
    {
        Default.Navigate(teamPageUrl + i);
        Default.Ready();

        var teamid = i.ToString();

        // Read each value once, strip its caption, then log and store the same string.
        var teamname = Default.SelectSingleNode("div.blockA>h2>span").Text();
        Logger.Log(teamname);
        var teamurl = Default.SelectSingleNode(infoItem + ">a").Text();
        Logger.Log(teamurl);
        var teamcity = Default.SelectSingleNode(infoItem + ":eq(1)").Text().Replace("主场所在城市:", "");
        Logger.Log(teamcity);
        var gym = Default.SelectSingleNode(infoItem + ":eq(2)").Text().Replace("主体育馆:", "");
        Logger.Log(gym);
        var peoplenum = Default.SelectSingleNode(infoItem + ":eq(3)").Text().Replace("可容纳人数:", "");
        Logger.Log(peoplenum);
        var intonba = Default.SelectSingleNode(infoItem + ":eq(4)").Text().Replace("加入NBA时间:", "");
        Logger.Log(intonba);
        var champion = Default.SelectSingleNode(infoItem + ":eq(5)").Text().Replace("获总冠军次数:", "");
        Logger.Log(champion);
        var coach = Default.SelectSingleNode(infoItem + ":eq(6)").Text().Replace("现任主教练:", "");
        Logger.Log(coach);

        // Fix: the city used to be scraped and logged but never stored, while the city
        // caption was stripped from teamurl instead. Store both values under their own keys.
        DataManager.AppendData("TEAM",
            DataEntry.Create()
                .Set("teamid", teamid)
                .Set("teamname", teamname)
                .Set("teamurl", teamurl)
                .Set("teamcity", teamcity)
                .Set("gym", gym)
                .Set("peoplenum", peoplenum)
                .Set("intonba", intonba)
                .Set("champion", champion)
                .Set("coach", coach)
        );
        Logger.Log(teamid);

        var roster = Default.SelectNodes("div.tab>table tr");
        foreach (var player in roster)
        {
            var profileLink = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a");
            // The selector matches every <tr>; a row without a profile link holds no
            // player (presumably a header row), so it is skipped instead of being parsed.
            if (profileLink.IsEmpty())
            {
                continue;
            }

            // The player id is the first run of digits in the profile link.
            var playerid = Regex.Match(profileLink.Attr("href"), @"\d+").Value;

            var num = player.SelectSingleNode("TD:eq(0)").Text();
            var portrait = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>div>a>img");
            var playerimageurl = portrait.Attr("src");
            var playername = player.SelectSingleNode("TD:eq(1)>DIV:eq(0)>P:eq(0)>A").Text();
            var position = player.SelectSingleNode("TD:eq(2)>SPAN:eq(0)").Text();
            var height = player.SelectSingleNode("TD:eq(3)").Text();
            var weight = player.SelectSingleNode("TD:eq(4)").Text();
            var birth = player.SelectSingleNode("TD:eq(5)").Text();
            var college = player.SelectSingleNode("TD:eq(6)").Text();

            Logger.Log(portrait.Text());
            Logger.Log(playername);
            Logger.Log(position);
            Logger.Log(height);
            Logger.Log(weight);
            Logger.Log(birth);
            Logger.Log(college);
            Logger.Log(playerimageurl);
            Logger.Log(playerid);

            DataManager.AppendData("player",
                DataEntry.Create()
                    .Set("playerid", playerid)
                    .Set("teamid", teamid)
                    .Set("playername", playername)
                    .Set("position", position)
                    .Set("height", height)
                    .Set("weight", weight)
                    .Set("birth", birth)
                    .Set("college", college)
                    .Set("num", num)
                    .Set("playerimageurl", playerimageurl)
            );
        }
    }
}

比赛信息: http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012

脚本如下:

/// <summary>
/// Opens the Sohu NBA schedule page for 2013-01, collects the link of every anchor whose
/// text contains "技术统计", then visits each link and appends two "GAMESTATIC" rows per
/// game: one filled from the away* values and one from the home* values.
/// </summary>
/// <remarks>
/// NOTE(review): in this copy every indexer argument (logos[], scores[], scoreslist[]) and
/// every operand after "Count==" is missing, so the method does not compile as shown.
/// The numeric literals must be restored from the original script before use.
/// </remarks>
public void Run()
{
Logger.ClearAll();
Default.Navigate("http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-01&season_year=2012");
Default.Ready();
// Collect all target URLs first, because navigating away replaces the page the nodes belong to.
var games = Default.SelectNodes("div.tab tr>td.e17>span.bluetext>a:contains(\"技术统计\")"); List<string> urls = new List<string>();
foreach(var g in games)
{
// Resolves the href against a base URL so the stored link is absolute.
// NOTE(review): the base uses m=2013-10&season_year=2013 while the page navigated above
// is m=2013-01&season_year=2012 - confirm this difference is harmless for the hrefs found.
var url = new Uri(new Uri("http://data.sports.sohu.com/nba/nba_schedule_by_month.php?m=2013-10&season_year=2013"), g.Attr("href")).ToString();
urls.Add(url.ToString());
}
foreach(var url in urls)
{
// Stop the whole run once the browser reports it is not available.
if( Default.Available == false) return;
Default.Navigate(url);
Default.Ready();
// NOTE(review): teamNames, awayscores, jiashiscores, awayid, homeid, homescore and
// awayscore are assigned below but never read again in this method.
var teamNames = Default.SelectNodes("div.blockA>h2");
var scores = Default.SelectNodes("table.tab04 tr");
var scoreslist = Default.SelectNodes("table.tab02 tr>td");
var awayscores = Default.SelectNodes("table.tab02 tr");
var jiashiscores = Default.SelectSingleNode("table.tab03>TD:eq(0)");
var logos = Default.SelectNodes("td.logo img");
// NOTE(review): the pattern contains a literal "d+" and an unescaped "."; it presumably
// lost its backslashes and was meant to be (?<id>\d+)\.jpg - confirm against the original.
var awayid =Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value;
var homeid =Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value;
var homescore=scores[].Text();
var awayscore=scores[].Text();
// Per-quarter scores: four cells for the away side, then four for the home side.
var awayscore1=scoreslist[].Text();
var awayscore2=scoreslist[].Text();
var awayscore3=scoreslist[].Text();
var awayscore4=scoreslist[].Text();
var homescore1=scoreslist[].Text();
var homescore2=scoreslist[].Text();
var homescore3=scoreslist[].Text();
var homescore4=scoreslist[].Text();
// Overtime ("jiashi") scores default to empty strings and stay empty when no overtime is found.
var gametime = Default.SelectSingleNode("div.center>h2"); var jiashiawayscores1="";
var jiashiawayscores2="" ;
var jiashiawayscores3 ="";
var jiashiawayscores4="";
var jiashihomescores1="";
var jiashihomescores2="";
var jiashihomescores3 ="";
var jiashihomescores4=""; var td = Default.SelectSingleNode("table.tabBig td:contains(\"加时赛\")");
// A cell containing "加时赛" marks a game with overtime. The number of th elements in the
// element following div.more then selects how many overtime columns (1 to 4) are read:
// row 1 of table.tab03 feeds the away values, row 2 the home values.
if(!td.IsEmpty())
{ if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
}
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text(); }
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashiawayscores3 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(2)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text();
jiashihomescores3 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(2)").Text(); }
else if(Default.SelectSingleNode("div.more").Next().SelectNodes("th").Count==)
{
jiashiawayscores1 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(0)").Text();
jiashiawayscores2 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(1)").Text();
jiashiawayscores3 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(2)").Text();
jiashiawayscores4 = Default.SelectSingleNode("table.tab03 tr:eq(1)>TD:eq(3)").Text();
jiashihomescores1 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(0)").Text();
jiashihomescores2 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(1)").Text();
jiashihomescores3 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(2)").Text();
jiashihomescores4 = Default.SelectSingleNode("table.tab03 tr:eq(2)>TD:eq(3)").Text(); } } DataManager.AppendData("GAMESTATIC",
DataEntry.Create()
.Set("teamid", Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value)
.Set("gametime",gametime.Text().Replace("开始比赛",""))
.Set("score1",awayscore1)
.Set("score2", awayscore2)
.Set("score3",awayscore3)
.Set("score4",awayscore4)
.Set("score", scores[].Text())
.Set("gameid",url)
.Set("status", "")
.Set("jiashiscore1",jiashiawayscores1)
.Set("jiashiscore2",jiashiawayscores2)
.Set("jiashiscore3",jiashiawayscores3)
.Set("jiashiscore4",jiashiawayscores4)
);
// Second row for the same game (same gameid and gametime), filled from the home values.
DataManager.AppendData("GAMESTATIC",
DataEntry.Create()
.Set("teamid", Regex.Match(logos[].Attr("src"), @"(?<id>d+).jpg").Groups["id"].Value)
.Set("gametime",gametime.Text().Replace("开始比赛",""))
.Set("score1",homescore1)
.Set("score2", homescore2)
.Set("score3",homescore3)
.Set("score4",homescore4)
.Set("score", scores[].Text())
.Set("gameid",url)
.Set("status", "")
.Set("jiashiscore1",jiashihomescores1)
.Set("jiashiscore2",jiashihomescores2)
.Set("jiashiscore3",jiashihomescores3)
.Set("jiashiscore4",jiashihomescores4)
); } }

这里的亮点是要看48, 49两行 (行号以原脚本为准, 大致对应查找"加时赛"单元格并判断其是否存在的部分), 这里对加时赛也进行了处理. 不是所有的比赛都有加时赛, 就算有也可能打多个加时 (1-4个). 因此甜瓜非常细心地对这块也做了处理. 个人感觉这块代码还有优化的余地, 但是这种处理非常简单直白, 一目了然, 也是很不错的.

最后运行起来:

文中开发工具Spider Studio (采集工作站)下载地址: http://www.gdtsearch.com/products.spiderstudio.htm. 安装后运行, 将脚本复制进去点"运行"即可看到效果.

Spider Studio QQ群: 45995410

示例 - C#脚本代码采集搜狐NBA球员, 球队和比赛实况的更多相关文章

  1. 使用CURL和火车头软件采集搜狐文章

    直接上代码: //参数1:访问的URL,参数2:post数据(不填则为GET),参数3:提交的$cookies,参数4:是否返回$cookies function curl_request($url, ...

  2. C# 脚本代码自动登录淘宝获取用户信息

    C# 脚本代码自动登录淘宝获取用户信息   最近遇到的一个需求是如何让程序自动登录淘宝, 获取用户名称等信息. 其实这个利用SS (SpiderStudio的简称) 实现起来非常简单. 十数行代码就可 ...

  3. crawler4j源码学习(1):搜狐新闻网新闻标题采集爬虫

    crawler4j是用Java实现的开源网络爬虫.提供了简单易用的接口,可以在几分钟内创建一个多线程网络爬虫.下面实例结合jsoup,采集搜狐新闻网(http://news.sohu.com/)新闻标 ...

  4. 利用朴素贝叶斯分类算法对搜狐新闻进行分类(python)

    数据来源  https://www.sogou.com/labs/resource/cs.php介绍:来自搜狐新闻2012年6月—7月期间国内,国际,体育,社会,娱乐等18个频道的新闻数据,提供URL ...

  5. jquery仿搜狐投票动画代码

    体验效果:http://hovertree.com/texiao/jquery/21/ 这是一款基于jquery实现的仿搜狐投票动画特效源码,运行该源码可见VS图标首先出现在中间位置,紧接着随着投票比 ...

  6. 【HTML&CSS】搜狐页面代码编写

    <!DOCTYPE html> <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"& ...

  7. 搜狐云景paas平台实践之路

    前言: 搜狐云景作为搜狐的paas平台,在2014年5月22日的云计算大会上正式发布了公测.初测,注册用户必须先申请邀请码参与公测会赠送用户100元电子券,经过实名认证之后会再赠送100电子券,目测可 ...

  8. 山寨Unity3D?搜狐畅游的免费开源游戏引擎Genesis-3D

    在CSDN上看到了<搜狐畅游发布3D游戏引擎Genesis-3D 基于MIT协议开源>(http://www.csdn.net/article/2013-11-21/2817585-cha ...

  9. 利用jieba,word2vec,LR进行搜狐新闻文本分类

    一.简介 1)jieba 中文叫做结巴,是一款中文分词工具,https://github.com/fxsjy/jieba 2)word2vec 单词向量化工具,https://radimrehurek ...

随机推荐

  1. vi命令整理

    vi命令整理 u 撤销上一次操作 ctrl+r 恢复上一次操作 : 跳转至第1行 :$ 跳转至最后一行 ctrl+f 向文章末尾翻页 ctrl+b 向文章开始翻页 yy 复制一行 p 粘贴刚刚复制第一 ...

  2. Token_使用JWT生成token

    1.token三部分 header   { "typ": "JWT", "alg": "HS256"   } paylo ...

  3. 在低带宽或不可靠的网络环境中安装 Visual Studio 2017

    在低带宽或不可靠的网络环境中安装 Visual Studio 2017 2017-4-141 分钟阅读时长 作者  https://docs.microsoft.com/zh-cn/visualstu ...

  4. Hibernate_8_Person和IdCard实例_一对一关系:基于外键

    1)建立Person类: public class Person { private Integer id; private String name; private IdCard IdCard; p ...

  5. (队列的应用5.3.3)POJ 3125 Printer Queue(优先队列的使用)

    /* * POJ_3125.cpp * * Created on: 2013年10月31日 * Author: Administrator */ #include <iostream> # ...

  6. ASP.NET操作Oracle知识记录(采用ODP.NET)

    最近因为要把以前做的一个项目数据库从MSSQL2005转移到Oracle上,所以开始接触Oracle,通过本篇随笔简单记录一些ASP.NET结合Oralce的操作: 因为微软未来不再支持 System ...

  7. Android 再按一次退出应用的代码

    private long exitTime = 0; @Override public boolean onKeyDown(int keyCode, KeyEvent event) { if (key ...

  8. 在 HTML 中使用JavaScript

    <script>元素 属性     async:可选.async 属性规定一旦脚本可用,则会异步执行,表示应该立即下载脚本,但不妨碍页面中的其他操作,比如下载其他资源或等待加载其他脚本.a ...

  9. MySQL5.6 主从复制 ERROR 1776 (HY000): Parameters MASTER_LOG_FILE

    主从都开启了gtid,在设置从库的时候遇到了问题 mysql> CHANGE MASTER TO MASTER_HOST=‘xxx’,MASTER_USER='replicant',MASTER ...

  10. 最大似然估计(MLE)与最大后验概率(MAP)

    何为:最大似然估计(MLE): 最大似然估计提供了一种给定观察数据来评估模型参数的方法,即:“模型已定,参数未知”.可以通过采样,获取部分数据,然后通过最大似然估计来获取已知模型的参数. 最大似然估计 ...