Intern---Microsoft Academic China Team
项目二:
AEther:
项目 一、项目需求:对搜索关键词进行类别的统计分析,为了后面的entity-rank做准备。
0,各种关键数据统计:
数据量:1个月数据:about 1000T。
1,对IE的所有浏览搜索的提取代码:
Scope:
//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history
// Purpose: from the competitive unified page view for a single day, count how often
// each (originating query, normalized request URL) pair occurs and write the result
// to a .wsv file.
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
// Input view, restricted to one day (Start == End) and to Source "All".
UnifiedViewRaw =
VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
PARAMS
(
Start = @"2016-07-26",
End = @"2016-07-26",
Source = @"All"
// Alternative source value kept for reference:
// Source = @"DesktopIE"
);
// Keep page views that were reached from a query page in the "web" vertical of the
// zh-cn market and that are not themselves query pages; the COUNT() aggregates per
// (Query, NormalizedUrl).
// NOTE(review): the numeric threshold after "Count >=" in the HAVING clause is missing
// in this copy of the script, so the statement is not valid as written - restore the
// original value before running.
ClickData =
SELECT Page_FromPage.Query,
RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
COUNT() AS Count
FROM UnifiedViewRaw
WHERE Page_FromPage.IsQuery == true AND Page_FromPage.Vertical.ToLower() == "web" AND NOT Request_IsQuery AND Page_FromPage.Market.ToLower() == "zh-cn"
HAVING Count >= AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;
// Page_FromPage.IsQuery: True if the page is a query page
// Vertical: Search Vertical of this PageView
// Request_IsQuery bool: True if this page view is search engine result page
// Write the aggregated pairs to the output stream.
OUTPUT
TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
/// <summary>
/// String-level URL helpers: scheme/"www."/trailing-slash stripping and host extraction.
/// </summary>
public class CMyUtils
{
    /// <summary>
    /// Lower-cases a URL and strips a leading "http://" or "https://", a leading
    /// "www." and a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string when <paramref name="url"/> is null or empty.</returns>
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // Invariant casing and ordinal comparison: URLs are identifiers, not
        // linguistic text, so the result must not depend on the current culture.
        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        // Only one trailing slash is removed.
        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>
    /// The normalized URL truncated at its first "/", or the whole normalized URL
    /// when it contains no "/".
    /// </returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');

        // IndexOf returns -1 when there is no path component to cut off.
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}
/// <summary>
/// Reducer whose output schema is a clone of its input schema and which copies
/// selected input rows to the output unchanged.
/// </summary>
public class TopReducer : Reducer
{
/// <summary>Returns a clone of the input schema as the output schema.</summary>
public override Schema Produces(string[] columns, string[] args, Schema input)
{
return input.Clone();
}
/// <summary>
/// Iterates over the rows of the input and yields a copy of each row that passes
/// the condition inside the loop.
/// </summary>
public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
{
// NOTE(review): the statement on the next line has lost everything before its ';'
// in this copy (presumably a local declaration/initialisation - TODO restore).
;
foreach (Row row in input.Rows)
{
// NOTE(review): the condition on the next line has lost its "if (...)" text; only
// the closing ')' survives, so which rows are kept cannot be determined from here.
)
{
row.Copy(output);
yield return output;
}
}
}
}
2,对bing的所有浏览搜索的提取代码:
Scope:
//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history
// Purpose: from the search-log page view for a single day, expand each zh-cn web
// page view into its web results, then count (query, normalized result URL) pairs
// and write them to a .wsv file.
REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;
// Input view for one day (Start == End), unsampled, Dataset "Mobile".
SlapiPageView =
VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
// Earlier alternatives (session view, 2013 date range) kept for reference:
//VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
//PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");
// Keep rows that are neither flagged as bot (Request_IsBotVNext) nor as marketing
// traffic, with Market "zh-cn" and Vertical "web"; the raw query is passed through
// WordBreaker.BreakWords with "zh-cn" and the request time is formatted as yyyy-MM-dd.
ZHCNTraffic =
SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
Page_Entities_WebResults
FROM SlapiPageView
WHERE Request_IsBotVNext == false AND Request_IsMarketingTraffic == false
AND string.IsNullOrEmpty(Market) == false AND Market.ToLower() == "zh-cn"
AND string.IsNullOrEmpty(Vertical) == false AND Vertical.ToLower() == "web";
// Disabled FormCode filter kept for reference:
//AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");
// Expand each page view into rows via the FEXLogSimpleExtractor processor defined
// in the accompanying C# code.
ProcessWebEntity =
PROCESS ZHCNTraffic
USING FEXLogSimpleExtractor;
// Aggregate per (Query, NormalizedUrl).
// NOTE(review): the numeric literals after "Click >" and "PairCount >=" are missing
// in this copy of the script, so the statement is not valid as written - restore the
// original values before running.
ClickQueryUrlPairs =
SELECT Query,
RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
COUNT() AS PairCount
FROM ProcessWebEntity
WHERE Click > HAVING PairCount >= AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;
// Write the aggregated pairs to the output stream.
OUTPUT
TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;
/// <summary>
/// String-level URL helpers: scheme/"www."/trailing-slash stripping and host extraction.
/// </summary>
public class URLUtility
{
    /// <summary>
    /// Lower-cases a URL and strips a leading "http://" or "https://", a leading
    /// "www." and a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string when <paramref name="url"/> is null or empty.</returns>
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // Invariant casing and ordinal comparison: URLs are identifiers, not
        // linguistic text, so the result must not depend on the current culture.
        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        // Only one trailing slash is removed.
        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>
    /// The normalized URL truncated at its first "/", or the whole normalized URL
    /// when it contains no "/".
    /// </returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');

        // IndexOf returns -1 when there is no path component to cut off.
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}
/// <summary>
/// Processor that expands each input row's web-result list into output rows with
/// the columns Query, QueryDate, Url, Host, POS and Click.
/// </summary>
public class FEXLogSimpleExtractor : Processor
{
/// <summary>Declares the fixed six-column output schema.</summary>
public override Schema Produces(string[] columns, string[] args, Schema input)
{
return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
}
/// <summary>
/// Reads "Query", "QDate" and "Page_Entities_WebResults" from every input row and
/// emits rows built from the entries of the web-result list.
/// </summary>
public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
{
foreach (Row row in input.Rows)
{
string Query = row["Query"].String;
string QueryDate = row["QDate"].String;
// "as" yields null when the column value is not a WebResultList.
// NOTE(review): WebEntities is dereferenced below without a null check - confirm the
// column can never hold another type or null.
var WebEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;
// NOTE(review): the loop header on the next line is truncated in this copy; only the
// condition and increment survive (presumably "for (int i = <start>" - TODO restore).
; i < WebEntities.Count;i++ )
{
string Url = WebEntities[i].TitleUrl;
string Host = URLUtility.GetHost(Url);
int Pos = WebEntities[i].PositionOfEntityInTopLevelRegion;
// Click is the number of entries in the result's Clicks collection.
int Click = WebEntities[i].Clicks.Count;
// NOTE(review): the condition on the next line has lost its "if (...)" text; only
// the closing ')' survives, so which results are emitted cannot be determined here.
)
{
output["Query"].Set(Query);
output["QueryDate"].Set(QueryDate);
output["Url"].Set(Url);
output["Host"].Set(Host);
output["POS"].Set(Pos);
output["Click"].Set(Click);
yield return output;
}
}
}
}
}
3,搜索查询和分类的提取代码:
//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b
//Used for tracking history
// Purpose: run RetroIndexProcessor over the latest index snapshot and write the
// Url, Country, Language and Category of rows whose Language is "zh_chs".
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll";
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
USING RetroIndex;
// Unsampled snapshot view.
// NOTE(review): the value assigned to TierFlag is missing in this copy of the
// script - restore it before running.
Snapshot =
VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view"
PARAMS
(
Sample = false,
TierFlag =
);
// Columns taken from the snapshot.
SELECT Url,
Header,
Body,
HttpHeader,
CodePage
FROM Snapshot;
// Produce the four output columns with RetroIndexProcessor and keep only rows with
// a non-empty Language equal to "zh_chs" (case-insensitive via ToLower).
Uberchunk =
PROCESS
PRODUCE Url,
Country,
Language,
Category
USING RetroIndexProcessor
HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs";
// Write the result to the output stream.
OUTPUT
TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
/// <summary>
/// Helpers that inspect a "#TAB#"-separated category string for "aa00"-prefixed
/// entries carrying a mobile-related suffix.
/// </summary>
public class Utility
{
    private const string CategorySeparator = "#TAB#";
    private const string CategoryPrefix = "aa00";

    private static readonly string[] MobileFriendlySuffixes =
    {
        "Mobi",
        "CrossDevice",
        "MobileFriendly",
        "MobileUnFriendly",
    };

    private static readonly string[] MobileUnFriendlySuffixes =
    {
        "MobileUnFriendly",
    };

    /// <summary>
    /// Tells whether any entry of <paramref name="category"/> starts with "aa00" and
    /// ends with "Mobi", "CrossDevice", "MobileFriendly" or "MobileUnFriendly".
    /// </summary>
    /// <param name="category">"#TAB#"-separated category entries; may be null or empty.</param>
    /// <returns>True when at least one entry matches; false otherwise, including for null or empty input.</returns>
    public static bool CJKVersionMobileFriendly(string category)
    {
        return HasPrefixedEntryWithSuffix(category, MobileFriendlySuffixes);
    }

    /// <summary>
    /// Tells whether any entry of <paramref name="category"/> starts with "aa00" and
    /// ends with "MobileUnFriendly".
    /// </summary>
    /// <param name="category">"#TAB#"-separated category entries; may be null or empty.</param>
    /// <returns>True when at least one entry matches; false otherwise, including for null or empty input.</returns>
    public static bool CJKVersionMobileUnFriendly(string category)
    {
        return HasPrefixedEntryWithSuffix(category, MobileUnFriendlySuffixes);
    }

    // Shared scan: true when some non-empty entry has the "aa00" prefix and one of the suffixes.
    private static bool HasPrefixedEntryWithSuffix(string category, string[] suffixes)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] entries = category.Split(new string[] { CategorySeparator }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string entry in entries)
        {
            // Ordinal comparison: the tags are machine identifiers, and culture-aware
            // matching can treat sequences such as "aa" specially in some locales.
            if (!entry.StartsWith(CategoryPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            foreach (string suffix in suffixes)
            {
                if (entry.EndsWith(suffix, StringComparison.Ordinal))
                {
                    return true;
                }
            }
        }

        return false;
    }
}
/// <summary>
/// Processor that emits one (Url, MobileClassifier) row for every input row that has
/// a non-empty Url and whose Language is "zh_chs".
/// </summary>
public class CJKVersionMobileOkClassifierProcessor : Processor
{
/// <summary>Declares the two-column output schema: Url (string) and MobileClassifier (int).</summary>
public override Schema Produces(string[] columns, string[] args, Schema input)
{
return new Schema("Url:string, MobileClassifier:int");
}
/// <summary>
/// Reads the classifier feature columns of each qualifying row, derives an integer
/// MobileClassifier value from them and yields it together with the Url.
/// </summary>
public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
{
foreach (Row row in input.Rows)
{
string Url = row["Url"].String;
string Language = row["Language"].String;
// Skip rows without a Url or whose Language is not "zh_chs" (case-insensitive via ToLower).
if (!(string.IsNullOrEmpty(Url) == false && string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs"))
{
continue;
}
//classifier features
string Category = row["Category"].String;
string DUPipeline_MobileUrls = row["DUPipeline_MobileUrls"].String;
string DUPipeline_ResponsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String;
string DUV2_MobileUrl = row["DUV2_MobileUrl"].String;
string InjHdr_MobileOkClassifier_v1 = row["InjHdr_MobileOkClassifier_v1"].String;
string InjHdr_MobileOkX_v1 = row["InjHdr_MobileOkX_v1"].String;
string InjHdr_MobileRedirect_V1 = row["InjHdr_MobileRedirect_V1"].String;
string SpamJunkRuleID = row["SpamJunkRuleID"].String;
// Prefer InjHdr_MobileOkClassifier_v1 and fall back to InjHdr_MobileOkX_v1 when it is empty.
string MobileOkClassifier = string.IsNullOrEmpty(InjHdr_MobileOkClassifier_v1) ? InjHdr_MobileOkX_v1 : InjHdr_MobileOkClassifier_v1;
// NOTE(review): the rest of this method is damaged in this copy. The bare ';' below
// has lost the statement before it (presumably the declaration of MobileClassifier,
// which is assigned and emitted further down - TODO restore). The last operand of the
// first condition, the two following conditions, and every value assigned to
// MobileClassifier are also missing, so the classification rules cannot be read from
// here. The first branch is at least entered when any of the four DUPipeline/DUV2/
// redirect features is non-empty or Utility.CJKVersionMobileFriendly(Category) is true.
;
if (string.IsNullOrEmpty(DUPipeline_MobileUrls) == false
|| string.IsNullOrEmpty(DUPipeline_ResponsiveDesignSpans) == false
|| string.IsNullOrEmpty(DUV2_MobileUrl) == false
|| string.IsNullOrEmpty(InjHdr_MobileRedirect_V1) == false
|| Utility.CJKVersionMobileFriendly(Category)
|| ("))
{
MobileClassifier = ;
}
"))
{
MobileClassifier = ;
}
")
{
MobileClassifier = ;
}
output["Url"].Set(Url);
output["MobileClassifier"].Set(MobileClassifier);
yield return output;
}
}
}
4,对IE和bing进行union,然后对相同的query进行合并。
Scope
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// Purpose: concatenate the IE and Bing (query, url, count) files, sum the counts per
// (Query, Url) and write the result ordered by Query.
// NOTE(review): this reads "IEqueryclickurlpairs.wsv", while the IE extraction script
// in these notes writes "IEqueryclickurlpairsAll.wsv" - confirm which file is intended.
ie =
EXTRACT Query : string,
Url : string,
Count : int
FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
USING DefaultTextExtractor();
bing =
EXTRACT Query : string,
Url : string,
Count : int
FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
USING DefaultTextExtractor();
// UNION ALL keeps duplicate rows, so a pair present in both inputs contributes both counts.
union_all =
SELECT *
FROM ie
UNION ALL
SELECT *
FROM bing;
// Sum the counts for each (Query, Url) and order the rows by Query.
result =
SELECT Query,
Url,
SUM(Count) AS NewCount
FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
5,得到Query, Category,ClickCount的对应。
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// Purpose (as written): concatenate the IE and Bing (query, url, count) files, sum the
// counts per (Query, Url) and write the result ordered by Query.
// NOTE(review): this script, including its GUID and output path, is the same as the
// union script listed under step 4 of these notes. It produces Query, Url and NewCount
// only - no Category column - so it does not match the step-5 heading; the intended
// script may be missing from the notes.
ie =
EXTRACT Query : string,
Url : string,
Count : int
FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
USING DefaultTextExtractor();
bing =
EXTRACT Query : string,
Url : string,
Count : int
FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
USING DefaultTextExtractor();
// UNION ALL keeps duplicate rows, so a pair present in both inputs contributes both counts.
union_all =
SELECT *
FROM ie
UNION ALL
SELECT *
FROM bing;
// Sum the counts for each (Query, Url) and order the rows by Query.
result =
SELECT Query,
Url,
SUM(Count) AS NewCount
FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
6,得到了query->Category之后,要算哪个category出现得最多。
对每个query,把它的每一条记录所属的category按该记录的clickCount加权,然后按category累加起来,累加值最大的即为该query的类别。
这里用reduce来实现。
7,算分数。
Intern---Microsoft Academic China Team的更多相关文章
- Announcing Microsoft Research Open Data – Datasets by Microsoft Research now available in the cloud
The Microsoft Research Outreach team has worked extensively with the external research community to ...
- Team Foundation 中的错误和事件消息
Visual Studio Team System Team Foundation 中的错误和事件消息 Team Foundation 通过显示错误消息和事件消息来通知您操作成功以及操作失败.一部分错 ...
- Microsoft Dynamics CRM 分销行业解决方案
Microsoft Dynamics CRM 分销行业解决方案 方案亮点 360度动态渠道信息管理 充分的客户细分 全面的业务代表考核指标 业务代表管理和能力建设 业务代表过程管理 业务代表费用管理 ...
- Azure China (4) 管理Azure China Storage Account
<Windows Azure Platform 系列文章目录> Update 2015-05-10 强烈建议使用AzCopy工具,AzCopy命令行工具,是经过优化的.高性能Azure S ...
- Azure China (1) Azure公有云落地中国
<Windows Azure Platform 系列文章目录> 微软公有云Microsoft Azure已经落地中国,官方网址:http://www.windowsazure.cn/. 在 ...
- Microsoft server software support for Microsoft Azure virtual machines
http://support.microsoft.com/kb/2721672/en-us Article ID: 2721672 - Last Review: November 22, 2014 ...
- 如何访问Microsoft Azure Storage
首先先要创建存储账户 http://www.cnblogs.com/SignalTips/p/4119128.html 可以通过以下的几个方式访问 通过Visual Studio 2013 Commu ...
- Microsoft TFS 如何显示在Windows 的上下文菜单中
How to showing in Windows Explorer context for TFS I am not sure if this would help or you are willi ...
- Microsoft .NET Pet Shop 简介
最初研究 .NET Pet Shop 的目的是用 Microsoft .NET 实现 Sun 主要的 J2EE 蓝图应用程序 Sun Java Pet Store 同样的应用程序功能. 根据用 .NE ...
随机推荐
- [LeetCode] Gray Code 格雷码
The gray code is a binary numeral system where two successive values differ in only one bit. Given a ...
- JS组件系列——BootstrapTable+KnockoutJS实现增删改查解决方案(四):自定义T4模板快速生成页面
前言:上篇介绍了下ko增删改查的封装,确实节省了大量的js代码.博主是一个喜欢偷懒的人,总觉得这些基础的增删改查效果能不能通过一个什么工具直接生成页面效果,啥代码都不用写了,那该多爽.于是研究了下T4 ...
- VS调试经常打断点打上之后没反应的问题
在调试的时候经常会发现打了断点但是始终不进到程序中来,这是因为访问的这个页面在服务器中有缓存,也就是在iis中产生了缓存.访问的时候直接进到读取的缓存文件, 根本没有读取项目文件,所以打了断点肯定进不 ...
- java日志学习笔记
一.日志家族 Log4j一开始就很强大,在jdk自带日志系统之前,apache就曾经尝试把log4j划为java的一部分,不知为何没能成功,sun还是用了自己很弱的日志系统.为了兼容各个日志系统,ap ...
- 【Quartz】配置最简单的集群
在许多情况,我们希望我们的定时任务是可靠的,不会因系统故障.机器宕机而导致某一笔定时任务不能按时运行.这种情况下,我们就需要为Quartz做个集群. 最简单的情况,有两台机器或两个应用,同时维护一批定 ...
- linux-crontab定时任务
crontab命令常见于Unix和Linux的操作系统之中,用于设置周期性被执行的指令.该命令从标准输入设备读取指令,并将其存放于"crontab"文件中,以供之后读取和执行.通常 ...
- Spring中配置数据源的4种形式
不管采用何种持久化技术,都需要定义数据源.Spring中提供了4种不同形式的数据源配置方式: spring自带的数据源(DriverManagerDataSource),DBCP数据源,C3P0数据源 ...
- LAMP(1) 在VirtualBox里安装Ubuntu Server
问题0.虚拟机中安装lamp环境 问题解决: 来自百度经验 问题1. 用putty远程登陆linux系统,显示network error connection refused 问题解决 问题2. my ...
- 关于input的file框onchange事件触发一次失效的新的解决方法
在google了众多方法后,网上有这么几种方法: 1.替换掉原来的input框 2.remove原来的input框,然后在添加进新的一样的input框 但是不知道为什么非常不幸的是,怎么弄我都弄不出. ...
- Python 【第六章】:Python操作 RabbitMQ、Redis、Memcache、SQLAlchemy
Memcached Memcached 是一个高性能的分布式内存对象缓存系统,用于动态Web应用以减轻数据库负载.它通过在内存中缓存数据和对象来减少读取数据库的次数,从而提高动态.数据库驱动网站的速度 ...