Intern---Microsoft Academic China Team
项目二:
AEther:
项目一、项目需求:对搜索关键词进行类别统计分析,为后面的 entity-rank 做准备。
0,各种关键数据统计:
数据量:1 个月的数据约 1000T。
1,对IE的所有浏览搜索的提取代码:
Scope:
//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history

// Extracts (Query, NormalizedUrl, Count) click pairs from the competitive unified
// page view for one day: page views that are not search result pages themselves but
// were reached from a zh-cn "web" query page.
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

UnifiedViewRaw =
    VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
    PARAMS
    (
        Start = @"2016-07-26",
        End = @"2016-07-26",
        Source = @"All"
        // Source = @"DesktopIE"
    );

// Page_FromPage.IsQuery: True if the page is a query page
// Vertical: Search Vertical of this PageView
// Request_IsQuery bool: True if this page view is search engine result page
//
// NOTE(review): the numeric operand of "HAVING Count >=" was missing from the published
// listing. 1 is used here because it drops no rows; confirm the intended minimum count.
ClickData =
    SELECT Page_FromPage.Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
           COUNT() AS Count
    FROM UnifiedViewRaw
    WHERE Page_FromPage.IsQuery == true
          AND Page_FromPage.Vertical.ToLower() == "web"
          AND NOT Request_IsQuery
          AND Page_FromPage.Market.ToLower() == "zh-cn"
    HAVING Count >= 1
           AND Query != ""
           AND NormalizedUrl != ""
           AND NormalizedUrl != null;

OUTPUT ClickData
TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// URL helpers used by the click-pair extraction script.
/// </summary>
public class CMyUtils
{
    /// <summary>
    /// Lower-cases a URL and strips a leading "http://" or "https://", a leading "www."
    /// and a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string when <paramref name="url"/> is null or empty.</returns>
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // Invariant lower-casing and ordinal comparisons: URLs are identifiers, not
        // linguistic text, so the result must not depend on the machine's culture.
        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>The host portion, or the whole normalized URL when it contains no "/".</returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);

        // NOTE(review): the condition guarding this cut was missing from the published
        // listing; ">= 0" (any slash found) is assumed.
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Reducer that passes through only the first rows of its input, keeping the input schema.
/// </summary>
public class TopReducer : Reducer
{
    // NOTE(review): the row limit was missing from the published listing. 1 is an
    // assumption; pass the real limit as the first reducer argument or fix this constant.
    private const int DefaultTopCount = 1;

    /// <summary>The output schema is a clone of the input schema.</summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return input.Clone();
    }

    /// <summary>
    /// Copies the first <c>limit</c> input rows to the output and ignores the rest.
    /// </summary>
    /// <param name="input">Rows handed to this reducer invocation.</param>
    /// <param name="output">Reusable output row.</param>
    /// <param name="args">Optional; args[0] is a positive integer row limit.</param>
    public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
    {
        int limit = DefaultTopCount;
        if (args != null && args.Length > 0)
        {
            int parsed;
            if (int.TryParse(args[0], out parsed) && parsed > 0)
            {
                limit = parsed;
            }
        }

        int emitted = 0;
        foreach (Row row in input.Rows)
        {
            // Keep enumerating after the limit is reached, matching the shape of the
            // original loop (a filter inside a full foreach, no early exit).
            if (emitted < limit)
            {
                emitted++;
                row.Copy(output);
                yield return output;
            }
        }
    }
}
2,对bing的所有浏览搜索的提取代码:
Scope:
//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history

// Extracts (Query, NormalizedUrl, PairCount) click pairs from the SLAPI page view for one
// day: zh-cn "web" traffic that is neither bot nor marketing traffic, with the raw query
// word-broken and one row per web result produced by FEXLogSimpleExtractor.
REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;

SlapiPageView =
    VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
    //VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
    //PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
    PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");

ZHCNTraffic =
    SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
           Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
           Page_Entities_WebResults
    FROM SlapiPageView
    WHERE Request_IsBotVNext == false
          AND Request_IsMarketingTraffic == false
          AND string.IsNullOrEmpty(Market) == false
          AND Market.ToLower() == "zh-cn"
          AND string.IsNullOrEmpty(Vertical) == false
          AND Vertical.ToLower() == "web";
          //AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");

ProcessWebEntity =
    PROCESS ZHCNTraffic
    USING FEXLogSimpleExtractor;

// NOTE(review): the numeric operands of "WHERE Click >" and "HAVING PairCount >=" were
// missing from the published listing. "Click > 0" (result was clicked) and
// "PairCount >= 1" (drops no pairs) are assumed; confirm the intended thresholds.
ClickQueryUrlPairs =
    SELECT Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
           COUNT() AS PairCount
    FROM ProcessWebEntity
    WHERE Click > 0
    HAVING PairCount >= 1
           AND Query != ""
           AND NormalizedUrl != ""
           AND NormalizedUrl != null;

OUTPUT ClickQueryUrlPairs
TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;

/// <summary>
/// URL helpers used by <see cref="FEXLogSimpleExtractor"/>.
/// </summary>
public class URLUtility
{
    /// <summary>
    /// Lower-cases a URL and strips a leading "http://" or "https://", a leading "www."
    /// and a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string when <paramref name="url"/> is null or empty.</returns>
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // Invariant lower-casing and ordinal comparisons: URLs are identifiers, not
        // linguistic text, so the result must not depend on the machine's culture.
        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>The host portion, or the whole normalized URL when it contains no "/".</returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);

        // NOTE(review): the condition guarding this cut was missing from the published
        // listing; ">= 0" (any slash found) is assumed.
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Expands each input row's web-result list into one output row per result, with columns
/// Query, QueryDate, Url, Host, POS and Click.
/// </summary>
public class FEXLogSimpleExtractor : Processor
{
    /// <summary>Declares the fixed output schema of this processor.</summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
    }

    /// <summary>
    /// Reads "Query", "QDate" and "Page_Entities_WebResults" from each input row and emits
    /// one row per web result that passes the click filter.
    /// </summary>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string query = row["Query"].String;
            string queryDate = row["QDate"].String;
            var webEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;

            // The "as" cast yields null when the column is empty or of another type;
            // skip such rows instead of failing on a null reference.
            if (webEntities == null)
            {
                continue;
            }

            for (int i = 0; i < webEntities.Count; i++)
            {
                string url = webEntities[i].TitleUrl;
                string host = URLUtility.GetHost(url);
                int pos = webEntities[i].PositionOfEntityInTopLevelRegion;
                int click = webEntities[i].Clicks.Count;

                // NOTE(review): the condition guarding the emit was missing from the
                // published listing. "click > 0" is assumed because the consuming script
                // filters on Click as well; confirm against the original code.
                if (click > 0)
                {
                    output["Query"].Set(query);
                    output["QueryDate"].Set(queryDate);
                    output["Url"].Set(url);
                    output["Host"].Set(host);
                    output["POS"].Set(pos);
                    output["Click"].Set(click);
                    yield return output;
                }
            }
        }
    }
}
3,搜索查询和分类的提取代码:
// Reads Url, Header, Body, HttpHeader and CodePage from the RetroIndex LatestSnapshot view,
// runs RetroIndexProcessor to produce Url, Country, Language and Category, keeps rows whose
// Language is "zh_chs" (compared after ToLower), and writes them to UrlCategory.wsv.
// NOTE(review): the value after "TierFlag =" is missing from this listing, and the whole
// script is collapsed onto one line behind the leading "//". Restore the line breaks and
// the TierFlag value from the original script before running.
//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b //Used for tracking history REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll"; REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll"; RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll"; USING RetroIndex; Snapshot = VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view" PARAMS ( Sample = false, TierFlag = ); SELECT Url, Header, Body, HttpHeader, CodePage FROM Snapshot; Uberchunk = PROCESS PRODUCE Url, Country, Language, Category USING RetroIndexProcessor HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs"; OUTPUT TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";
C#:
// Utility: helpers over a category string split on the literal separator "#TAB#".
//   CJKVersionMobileFriendly returns true when any entry starts with "aa00" and ends with
//   "Mobi", "CrossDevice", "MobileFriendly" or "MobileUnFriendly"; false for null/empty input.
//   CJKVersionMobileUnFriendly returns true when any entry starts with "aa00" and ends with
//   "MobileUnFriendly"; false for null/empty input.
// CJKVersionMobileOkClassifierProcessor: Processor whose output schema is
//   "Url:string, MobileClassifier:int". Rows with an empty Url, an empty Language, or a
//   Language other than "zh_chs" (compared after ToLower) are skipped.
// NOTE(review): this listing is damaged. The MobileClassifier declaration, the literals
//   assigned to it and parts of the if-conditions are missing, so the classification rules
//   cannot be read from here and the code does not compile as shown. Restore from the original.
// NOTE(review): CJKVersionMobileFriendly also accepts the "MobileUnFriendly" suffix;
//   confirm that this is intended.
using System; using System.Collections.Generic; using System.IO; using System.Text; using ScopeRuntime; public class Utility { public static bool CJKVersionMobileFriendly(string category) { if (string.IsNullOrEmpty(category)) { return false; } string[] cates = category.Split(new string[] { "#TAB#" }, StringSplitOptions.RemoveEmptyEntries); foreach (string cate in cates) { if(cate.StartsWith("aa00") && (cate.EndsWith("Mobi") || cate.EndsWith("CrossDevice") || cate.EndsWith("MobileFriendly") || cate.EndsWith("MobileUnFriendly"))) { return true; } } return false; } public static bool CJKVersionMobileUnFriendly(string category) { if (string.IsNullOrEmpty(category)) { return false; } string[] cates = category.Split(new string[] { "#TAB#" }, StringSplitOptions.RemoveEmptyEntries); foreach (string cate in cates) { if (cate.StartsWith("aa00") && cate.EndsWith("MobileUnFriendly")) { return true; } } return false; } } public class CJKVersionMobileOkClassifierProcessor : Processor { public override Schema Produces(string[] columns, string[] args, Schema input) { return new Schema("Url:string, MobileClassifier:int"); } public override IEnumerable<Row> Process(RowSet input, Row output, string[] args) { foreach (Row row in input.Rows) { string Url = row["Url"].String; string Language = row["Language"].String; if (!(string.IsNullOrEmpty(Url) == false && string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs")) { continue; } //classifier features string Category = row["Category"].String; string DUPipeline_MobileUrls = row["DUPipeline_MobileUrls"].String; string DUPipeline_ResponsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String; string DUV2_MobileUrl = row["DUV2_MobileUrl"].String; string InjHdr_MobileOkClassifier_v1 = row["InjHdr_MobileOkClassifier_v1"].String; string InjHdr_MobileOkX_v1 = row["InjHdr_MobileOkX_v1"].String; string InjHdr_MobileRedirect_V1 = row["InjHdr_MobileRedirect_V1"].String; string SpamJunkRuleID = 
row["SpamJunkRuleID"].String; string MobileOkClassifier = string.IsNullOrEmpty(InjHdr_MobileOkClassifier_v1) ? InjHdr_MobileOkX_v1 : InjHdr_MobileOkClassifier_v1; ; if (string.IsNullOrEmpty(DUPipeline_MobileUrls) == false || string.IsNullOrEmpty(DUPipeline_ResponsiveDesignSpans) == false || string.IsNullOrEmpty(DUV2_MobileUrl) == false || string.IsNullOrEmpty(InjHdr_MobileRedirect_V1) == false || Utility.CJKVersionMobileFriendly(Category) || (")) { MobileClassifier = ; } ")) { MobileClassifier = ; } ") { MobileClassifier = ; } output["Url"].Set(Url); output["MobileClassifier"].Set(MobileClassifier); yield return output; } } }
4,对IE和bing进行union,然后对相同的query进行合并。
Scope
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history

// Unions the IE and Bing (Query, Url, Count) click-pair files, sums Count per
// (Query, Url) pair and writes the result ordered by Query.
ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();

bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all
    ORDER BY Query;

OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
5,得到Query, Category,ClickCount的对应。
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history

// Unions the IE and Bing (Query, Url, Count) click-pair files, sums Count per
// (Query, Url) pair and writes the result ordered by Query.
// NOTE(review): this script carries the same GUID and statements as the union script of
// step 4 and does not join against any category data, although its heading describes a
// Query/Category/ClickCount mapping. The intended step-5 script appears to be missing.
ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();

bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all
    ORDER BY Query;

OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
6,得到了query->Category之后,要算哪个category出现得最多。
每条记录中该category每出现一次,就乘以对应的clickCount,然后把结果累加起来。
这里用到了reduce来做。
7,算分数。
Intern---Microsoft Academic China Team的更多相关文章
- Announcing Microsoft Research Open Data – Datasets by Microsoft Research now available in the cloud
The Microsoft Research Outreach team has worked extensively with the external research community to ...
- Team Foundation 中的错误和事件消息
Visual Studio Team System Team Foundation 中的错误和事件消息 Team Foundation 通过显示错误消息和事件消息来通知您操作成功以及操作失败.一部分错 ...
- Microsoft Dynamics CRM 分销行业解决方案
Microsoft Dynamics CRM 分销行业解决方案 方案亮点 360度动态渠道信息管理 充分的客户细分 全面的业务代表考核指标 业务代表管理和能力建设 业务代表过程管理 业务代表费用管理 ...
- Azure China (4) 管理Azure China Storage Account
<Windows Azure Platform 系列文章目录> Update 2015-05-10 强烈建议使用AzCopy工具,AzCopy命令行工具,是经过优化的.高性能Azure S ...
- Azure China (1) Azure公有云落地中国
<Windows Azure Platform 系列文章目录> 微软公有云Microsoft Azure已经落地中国,官方网址:http://www.windowsazure.cn/. 在 ...
- Microsoft server software support for Microsoft Azure virtual machines
http://support.microsoft.com/kb/2721672/en-us Article ID: 2721672 - Last Review: November 22, 2014 ...
- 如何访问Microsoft Azure Storage
首先先要创建存储账户 http://www.cnblogs.com/SignalTips/p/4119128.html 可以通过以下的几个方式访问 通过Visual Studio 2013 Commu ...
- Microsoft TFS 如何显示在Windows 的上下文菜单中
How to showing in Windows Explorer context for TFS I am not sure if this would help or you are willi ...
- Microsoft .NET Pet Shop 简介
最初研究 .NET Pet Shop 的目的是用 Microsoft .NET 实现 Sun 主要的 J2EE 蓝图应用程序 Sun Java Pet Store 同样的应用程序功能. 根据用 .NE ...
随机推荐
- C#进阶系列——WebApi 路由机制剖析:你准备好了吗?
前言:从MVC到WebApi,路由机制一直是伴随着这些技术的一个重要组成部分. 它可以很简单:如果你仅仅只需要会用一些简单的路由,如/Home/Index,那么你只需要配置一个默认路由就能简单搞定: ...
- ElasticSearch+Kibana 索引操作( 附源码)
一 前言 ElasticiSearch 简介 ElasticSearch是一个基于Lucene的搜索服务器.它提供了一个分布式多用户能力的全文搜索引擎,基于RESTful web接口.Elastics ...
- @RenderSection,@RenderPage,@RenderBody介绍
在MVC的模板页中会用到上面三个东西,那么今天就简单归纳下各有什么作用 1.@RenderSection 用法 对CSS或JS部分模块的预留定义 例如模板页定义了@RenderSection(&quo ...
- 协程--gevent模块(单线程高并发)
先恶补一下知识点,上节回顾 上下文切换:当CPU从执行一个线程切换到执行另外一个线程的时候,它需要先存储当前线程的本地的数据,程序指针等,然后载入另一个线程的本地数据,程序指针等,最后才开始执行.这种 ...
- Android必学——AsyncTask
第一章 AsyncTask的基本构成 为是么要异步任务 1)Android单线程模型 2)耗时操作放在非主线程中执行 AsyncTask为何而生 1)子线程中跟新UI 2)封装.简化异步操作 pub ...
- bzoj1078【SCOI2008】斜堆
题意: 斜堆(skew heap)是一种常用的数据结构.它也是二叉树,且满足与二叉堆相同的堆性质:每个非根结点的值都比它父亲大.因此在整棵斜堆中,根的值最小.但斜堆不必是平衡的,每个结点的左右儿子的大 ...
- redis并发问题
redis中的并发问题 使用redis作为缓存已经很久了,redis是以单进程的形式运行的,命令是一个接着一个执行的,一直以为不会存在并发的问题,直到今天看到相关的资料,才恍然大悟~~ 具体问题实例 ...
- windows常用快捷键
windows常用快捷键 Fn+F1:Windows的支持和帮助 =Windows 徽标+F1 Fn+F2:重命名对象 Fn+F3:查找所有文件 Fn+F4:选择“转到不同的文件夹”框并沿框中的项向下 ...
- fsr
Front-end server render 前端在后端的渲染 1.采用express框架创建项目 express -e fsr cd fsr npm install 2.模板选用artTempla ...
- 为不同分辨率单独做样式文件,在页面头部用js判断分辨率后动态加载定义好的样式文件
为不同分辨率单独做样式文件,在页面头部用js判断分辨率后动态加载定义好的样式文件.样式文件命名格式如:forms[_屏幕宽度].css,样式文件中只需重新定义文本框和下拉框的宽度即可. 在包含的头文件 ...