Intern---Microsoft Academic China Team
项目二:
AEther:
项目 一、项目需求:对搜索关键词进行类别的统计分析,为后面的entity-rank做准备。
0,各种关键数据统计:
数据量:1个月的数据约为1000T。
1,对IE的所有浏览搜索的提取代码:
Scope:
// SCOPE script: reads the CompetitiveUnifiedPageView view for 2016-07-26 (Source = "All") and selects
// Page_FromPage.Query, the normalized Request_Url and COUNT() for rows where Page_FromPage.IsQuery is true,
// Page_FromPage.Vertical is "web", Page_FromPage.Market is "zh-cn" (both compared after ToLower()) and
// Request_IsQuery is false; empty queries and empty/null normalized URLs are dropped. The result is
// output to IEqueryclickurlpairsAll.wsv.
// NOTE(review): in this copy the whole script is collapsed onto one line, so everything after the first
// "//" reads as a comment; the original line breaks must be restored before it can run.
// NOTE(review): "HAVING Count >=" has no right-hand operand here - the threshold value appears to have
// been lost from this copy; confirm it against the original script.
//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999 //Used for tracking history REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll"; RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll"; UnifiedViewRaw = VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view" PARAMS ( Start = @"2016-07-26", End = @"2016-07-26", Source = @"All" // Source = @"DesktopIE" ); ClickData = SELECT Page_FromPage.Query, RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl, COUNT() AS Count FROM UnifiedViewRaw WHERE Page_FromPage.IsQuery == true AND Page_FromPage.Vertical.ToLower() == "web" AND NOT Request_IsQuery AND Page_FromPage.Market.ToLower() == "zh-cn" HAVING Count >= AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null; // Page_FromPage.IsQuery: True if the page is a query page // Vertical: Search Vertical of this PageView // Request_IsQuery bool: True if this page view is search engine result page OUTPUT TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";
C#:
// C# code-behind. CMyUtils.NormalizeURL lower-cases a URL, strips a leading "http://" or "https://",
// then a leading "www.", and (presumably) a trailing "/". CMyUtils.GetHost normalizes the URL and
// (presumably) keeps only the part before the first '/'.
// TopReducer declares the same output schema as its input (input.Clone()) and copies the rows it
// selects to the output unchanged.
// NOTE(review): tokens are missing from this copy - both Substring(...) argument lists, the condition
// guarding the Substring call in GetHost, and the leading statement and if-condition in
// TopReducer.Reduce - so the code does not compile as shown, and the rule TopReducer uses to pick
// rows cannot be determined from here. Confirm against the original source.
using System; using System.Collections.Generic; using System.IO; using System.Text; using ScopeRuntime; public class CMyUtils { static public string NormalizeURL(string url) { url = url.ToLower(); if (url.StartsWith("http://")) { url = url.Substring("http://".Length); } else if (url.StartsWith("https://")) { url = url.Substring("https://".Length); } if (url.StartsWith("www.")) { url = url.Substring("www.".Length); } if (url.EndsWith("/")) { url = url.Substring(, url.Length - ); } return url; } static public string GetHost(string url) { url = NormalizeURL(url); int slashPosition = url.IndexOf('/'); ) { url = url.Substring(, slashPosition); } return url; } } public class TopReducer : Reducer { public override Schema Produces(string[] columns, string[] args, Schema input) { return input.Clone(); } public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args) { ; foreach (Row row in input.Rows) { ) { row.Copy(output); yield return output; } } } }
2,对bing的所有浏览搜索的提取代码:
Scope:
// SCOPE script: reads the SLAPI SearchLogPageView view for 2016-07-26 (UseSample = false, Dataset = "Mobile"),
// keeps rows where Request_IsBotVNext and Request_IsMarketingTraffic are false, Market is "zh-cn" and
// Vertical is "web" (both compared after ToLower()), word-breaks Query_RawQuery with WordBreaker, and
// passes the rows through the FEXLogSimpleExtractor processor. It then selects Query, the normalized Url
// and COUNT() AS PairCount, dropping empty queries and empty/null normalized URLs, and outputs to
// bingqueryclickurlpairsTime.wsv.
// NOTE(review): in this copy the script is collapsed onto one line, so everything after the first "//"
// reads as a comment; the original line breaks must be restored before it can run.
// NOTE(review): "WHERE Click >" and "HAVING PairCount >=" both lack their right-hand operands here -
// the threshold values appear to have been lost from this copy; confirm them against the original script.
//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16 //Used for tracking history REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll"; RESOURCE "/local/IndexQualityCJK/wb/unzip.exe"; RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip"; REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll"; REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe"; REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll"; REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll"; RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll"; USING MS.Internal.Bing.DataMining.SearchLogApi; USING Microsoft.Live.Json; SlapiPageView = VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view" //VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view" //PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile"); PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile"); ZHCNTraffic = SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query, Request_RequestTime.ToString("yyyy-MM-dd") AS QDate, Page_Entities_WebResults FROM SlapiPageView WHERE Request_IsBotVNext == false AND Request_IsMarketingTraffic == false AND string.IsNullOrEmpty(Market) == false AND Market.ToLower() == "zh-cn" AND string.IsNullOrEmpty(Vertical) == false AND Vertical.ToLower() == "web"; //AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI"); ProcessWebEntity = PROCESS ZHCNTraffic USING FEXLogSimpleExtractor; ClickQueryUrlPairs = SELECT Query, RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl, COUNT() AS PairCount FROM ProcessWebEntity WHERE Click > HAVING PairCount >= AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null; OUTPUT TO 
@"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";
C#:
// C# code-behind. URLUtility.NormalizeURL lower-cases a URL, strips a leading "http://" or "https://",
// then a leading "www.", and (presumably) a trailing "/"; URLUtility.GetHost normalizes the URL and
// (presumably) keeps only the part before the first '/'.
// FEXLogSimpleExtractor produces rows with schema Query, QueryDate, Url, Host, POS, Click. For each input
// row it reads Query, QDate and Page_Entities_WebResults, and for the web results it keeps it sets Url from
// TitleUrl, Host from URLUtility.GetHost(Url), POS from PositionOfEntityInTopLevelRegion and Click from
// Clicks.Count.
// NOTE(review): tokens are missing from this copy - both Substring(...) argument lists, the condition
// guarding the Substring call in GetHost, the start of the for-loop header, and the if-condition guarding
// the output - so the code does not compile as shown, and which results are emitted cannot be determined
// from here. Confirm against the original source.
// NOTE(review): WebEntities comes from an "as" cast and is dereferenced without a null check - confirm
// the column can never be null or of another type.
using System; using System.Collections.Generic; using System.IO; using System.Text; using ScopeRuntime; using MS.Internal.Bing.DataMining.SearchLogApi; public class URLUtility { static public string NormalizeURL(string url) { url = url.ToLower(); if (url.StartsWith("http://")) { url = url.Substring("http://".Length); } else if (url.StartsWith("https://")) { url = url.Substring("https://".Length); } if (url.StartsWith("www.")) { url = url.Substring("www.".Length); } if (url.EndsWith("/")) { url = url.Substring(, url.Length - ); } return url; } static public string GetHost(string url) { url = NormalizeURL(url); int slashPosition = url.IndexOf('/'); ) { url = url.Substring(, slashPosition); } return url; } } public class FEXLogSimpleExtractor : Processor { public override Schema Produces(string[] columns, string[] args, Schema input) { return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int"); } public override IEnumerable<Row> Process(RowSet input, Row output, string[] args) { foreach (Row row in input.Rows) { string Query = row["Query"].String; string QueryDate = row["QDate"].String; var WebEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList; ; i < WebEntities.Count;i++ ) { string Url = WebEntities[i].TitleUrl; string Host = URLUtility.GetHost(Url); int Pos = WebEntities[i].PositionOfEntityInTopLevelRegion; int Click = WebEntities[i].Clicks.Count; ) { output["Query"].Set(Query); output["QueryDate"].Set(QueryDate); output["Url"].Set(Url); output["Host"].Set(Host); output["POS"].Set(Pos); output["Click"].Set(Click); yield return output; } } } } }
3,搜索查询和分类的提取代码:
// SCOPE script: selects Url, Header, Body, HttpHeader and CodePage from the RetroIndex LatestSnapshot view
// (Sample = false), runs RetroIndexProcessor to produce Url, Country, Language and Category, keeps rows whose
// Language is non-empty and equals "zh_chs" after ToLower(), and outputs to UrlCategory.wsv.
// NOTE(review): in this copy the script is collapsed onto one line, so everything after the first "//"
// reads as a comment; the original line breaks must be restored before it can run.
// NOTE(review): "TierFlag = " has no value here - it appears to have been lost from this copy; confirm it
// against the original script.
//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b //Used for tracking history REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll"; REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll"; RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll"; USING RetroIndex; Snapshot = VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view" PARAMS ( Sample = false, TierFlag = ); SELECT Url, Header, Body, HttpHeader, CodePage FROM Snapshot; Uberchunk = PROCESS PRODUCE Url, Country, Language, Category USING RetroIndexProcessor HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs"; OUTPUT TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";
C#:
// C# code-behind. Utility.CJKVersionMobileFriendly and Utility.CJKVersionMobileUnFriendly split a category
// string on "#TAB#" (dropping empty entries) and return true when any entry starts with "aa00" and ends
// with an accepted suffix: "Mobi", "CrossDevice", "MobileFriendly" or "MobileUnFriendly" for the former,
// only "MobileUnFriendly" for the latter. A null or empty category yields false.
// CJKVersionMobileOkClassifierProcessor produces (Url, MobileClassifier) rows; input rows are skipped unless
// Url is non-empty and Language equals "zh_chs" after ToLower().
// NOTE(review): CJKVersionMobileFriendly also accepts the "MobileUnFriendly" suffix - confirm that is intended.
// NOTE(review): tokens are missing from Process in this copy - the declaration of MobileClassifier, the
// values assigned to it, and parts of the three conditions that choose between them - so the code does not
// compile as shown and the classification rules cannot be determined from here. Confirm against the
// original source.
using System; using System.Collections.Generic; using System.IO; using System.Text; using ScopeRuntime; public class Utility { public static bool CJKVersionMobileFriendly(string category) { if (string.IsNullOrEmpty(category)) { return false; } string[] cates = category.Split(new string[] { "#TAB#" }, StringSplitOptions.RemoveEmptyEntries); foreach (string cate in cates) { if(cate.StartsWith("aa00") && (cate.EndsWith("Mobi") || cate.EndsWith("CrossDevice") || cate.EndsWith("MobileFriendly") || cate.EndsWith("MobileUnFriendly"))) { return true; } } return false; } public static bool CJKVersionMobileUnFriendly(string category) { if (string.IsNullOrEmpty(category)) { return false; } string[] cates = category.Split(new string[] { "#TAB#" }, StringSplitOptions.RemoveEmptyEntries); foreach (string cate in cates) { if (cate.StartsWith("aa00") && cate.EndsWith("MobileUnFriendly")) { return true; } } return false; } } public class CJKVersionMobileOkClassifierProcessor : Processor { public override Schema Produces(string[] columns, string[] args, Schema input) { return new Schema("Url:string, MobileClassifier:int"); } public override IEnumerable<Row> Process(RowSet input, Row output, string[] args) { foreach (Row row in input.Rows) { string Url = row["Url"].String; string Language = row["Language"].String; if (!(string.IsNullOrEmpty(Url) == false && string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs")) { continue; } //classifier features string Category = row["Category"].String; string DUPipeline_MobileUrls = row["DUPipeline_MobileUrls"].String; string DUPipeline_ResponsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String; string DUV2_MobileUrl = row["DUV2_MobileUrl"].String; string InjHdr_MobileOkClassifier_v1 = row["InjHdr_MobileOkClassifier_v1"].String; string InjHdr_MobileOkX_v1 = row["InjHdr_MobileOkX_v1"].String; string InjHdr_MobileRedirect_V1 = row["InjHdr_MobileRedirect_V1"].String; string SpamJunkRuleID = 
row["SpamJunkRuleID"].String; string MobileOkClassifier = string.IsNullOrEmpty(InjHdr_MobileOkClassifier_v1) ? InjHdr_MobileOkX_v1 : InjHdr_MobileOkClassifier_v1; ; if (string.IsNullOrEmpty(DUPipeline_MobileUrls) == false || string.IsNullOrEmpty(DUPipeline_ResponsiveDesignSpans) == false || string.IsNullOrEmpty(DUV2_MobileUrl) == false || string.IsNullOrEmpty(InjHdr_MobileRedirect_V1) == false || Utility.CJKVersionMobileFriendly(Category) || (")) { MobileClassifier = ; } ")) { MobileClassifier = ; } ") { MobileClassifier = ; } output["Url"].Set(Url); output["MobileClassifier"].Set(MobileClassifier); yield return output; } } }
4,对IE和bing进行union,然后对相同的query进行合并。
Scope
// SCOPE script: extracts (Query, Url, Count) rows from the IE and Bing pair files with DefaultTextExtractor,
// concatenates them with UNION ALL, selects Query, Url and SUM(Count) AS NewCount ordered by Query, and
// outputs the result to UnionResultSorted.wsv.
// NOTE(review): no GROUP BY is written; presumably the aggregation groups by the non-aggregated columns
// (Query, Url) - confirm.
// NOTE(review): the IE input here is IEqueryclickurlpairs.wsv, while the step-1 script in this document
// outputs IEqueryclickurlpairsAll.wsv - confirm which file is intended.
// NOTE(review): in this copy the script is collapsed onto one line, so everything after the first "//"
// reads as a comment; the original line breaks must be restored before it can run.
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a //Used for tracking history ie = EXTRACT Query : string, Url : string, Count : int FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv" USING DefaultTextExtractor(); bing = EXTRACT Query : string, Url : string, Count : int FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv" USING DefaultTextExtractor(); union_all = SELECT * FROM ie UNION ALL SELECT * FROM bing; result = SELECT Query, Url, SUM(Count) AS NewCount FROM union_all ORDER BY Query; OUTPUT result TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
5,得到Query, Category,ClickCount的对应。
// SCOPE script: extracts (Query, Url, Count) rows from the IE and Bing pair files with DefaultTextExtractor,
// concatenates them with UNION ALL, selects Query, Url and SUM(Count) AS NewCount ordered by Query, and
// outputs the result to UnionResultSorted.wsv.
// NOTE(review): this text (including the script GUID) is identical to the step-4 union script, and its
// output has no Category column even though this step is described as producing Query, Category and
// ClickCount - the script intended for this step appears to be missing; confirm against the original.
// NOTE(review): in this copy the script is collapsed onto one line, so everything after the first "//"
// reads as a comment; the original line breaks must be restored before it can run.
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a //Used for tracking history ie = EXTRACT Query : string, Url : string, Count : int FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv" USING DefaultTextExtractor(); bing = EXTRACT Query : string, Url : string, Count : int FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv" USING DefaultTextExtractor(); union_all = SELECT * FROM ie UNION ALL SELECT * FROM bing; result = SELECT Query, Url, SUM(Count) AS NewCount FROM union_all ORDER BY Query; OUTPUT result TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
6,得到了query->Category之后,要算哪个category出现得最多。
即对该category出现的每一条记录,按其clickCount加权,然后累加起来。
这里用到了reduce来做。
7,算分数。
Intern---Microsoft Academic China Team的更多相关文章
- Announcing Microsoft Research Open Data – Datasets by Microsoft Research now available in the cloud
The Microsoft Research Outreach team has worked extensively with the external research community to ...
- Team Foundation 中的错误和事件消息
Visual Studio Team System Team Foundation 中的错误和事件消息 Team Foundation 通过显示错误消息和事件消息来通知您操作成功以及操作失败.一部分错 ...
- Microsoft Dynamics CRM 分销行业解决方案
Microsoft Dynamics CRM 分销行业解决方案 方案亮点 360度动态渠道信息管理 充分的客户细分 全面的业务代表考核指标 业务代表管理和能力建设 业务代表过程管理 业务代表费用管理 ...
- Azure China (4) 管理Azure China Storage Account
<Windows Azure Platform 系列文章目录> Update 2015-05-10 强烈建议使用AzCopy工具,AzCopy命令行工具,是经过优化的.高性能Azure S ...
- Azure China (1) Azure公有云落地中国
<Windows Azure Platform 系列文章目录> 微软公有云Microsoft Azure已经落地中国,官方网址:http://www.windowsazure.cn/. 在 ...
- Microsoft server software support for Microsoft Azure virtual machines
http://support.microsoft.com/kb/2721672/en-us Article ID: 2721672 - Last Review: November 22, 2014 ...
- 如何访问Microsoft Azure Storage
首先先要创建存储账户 http://www.cnblogs.com/SignalTips/p/4119128.html 可以通过以下的几个方式访问 通过Visual Studio 2013 Commu ...
- Microsoft TFS 如何显示在Windows 的上下文菜单中
How to showing in Windows Explorer context for TFS I am not sure if this would help or you are willi ...
- Microsoft .NET Pet Shop 简介
最初研究 .NET Pet Shop 的目的是用 Microsoft .NET 实现 Sun 主要的 J2EE 蓝图应用程序 Sun Java Pet Store 同样的应用程序功能. 根据用 .NE ...
随机推荐
- 大三那年在某宝8块钱买的.NET视频决定了我的职业生涯
前言 谨以此文献给那些还在大学中迷茫的莘莘学子们! 韩愈在<师说>中提出了作为师者应该做的三件事:传道.授业.解惑. 1.传道:培养学生的道德观 2.授业:传授学生专业技能 3.解惑:解答 ...
- Android---观察者模式的简单实现demo
ObserverListerner: subjectListener: 观察者管理类: 使用方法: 1. 接口: 2. 注册观察者: 3. 通知:(触发事件执行): 4. 实现方法:(都要写, 只在要 ...
- mysql Packet for query is too large (1185 > 1024)异常
注:最近mysql一直提示如下错误 Packet for query is too large (1185 > 1024). You can change this value on the s ...
- spring架构源码:
p.p1 { margin: 0.0px 0.0px 0.0px 0.0px; font: 12.0px "Helvetica Neue"; color: #454545 } p. ...
- C# Winform防止一个程序重复运行
1: //在写一些服务型的软件的时候,你可能不希望一个操作系统里有两个副本在运行,这样也许会扰乱你的操作.这时,你就需要限制程序的副本.下面的这个方法,很简单的就可以实现上述功能. using Sys ...
- Python环境配置安装
2016年12月20日14:15:23 -------------- 参考菜鸟教程: Python 环境搭建 | 菜鸟教程 http://www.runoob.com/python/python-i ...
- C#之索引器
实际中不使用这个东西,只做了解 using System; using System.Collections.Generic; using System.Linq; using System.Text ...
- 安装PHP sphinx扩展 sphinx-1.1.0/sphinx.c:105:2: error: too few arguments 错误
表现: /home/sphinx-1.1.0/sphinx.c: In function 'php_sphinx_client_read_property':/home/sphinx-1.1.0/sp ...
- MySql 小记
MySql 简单 小记 以备查看 1.sql概述 1.什么是sql? 2.sql发展过程? 3.sql标准与方言的关系? 4.常用数据库? 5.MySql数据库安装? 2.关键概念 表结构----- ...
- MyBatis Cache配置
@(MyBatis)[Cache] MyBatis Cache配置 MyBatis提供了一级缓存和二级缓存 配置 全局配置 配置 说明 默认值 可选值 cacheEnabled 全局缓存的开关 tru ...