项目二:

AEther:

项目 一、项目需求:对搜索关键词进行类别统计分析,为后面的 entity-rank 做准备。

0,各种关键数据统计:

数据量:1 个月的数据约 1000T。

1,对IE的所有浏览搜索的提取代码:

Scope:

//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history
// Purpose: count (query, normalized URL) pairs taken from the CompetitiveUnifiedPageView
// view for the date range in PARAMS, and write the counts to a .wsv file.
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
// Load the page-view rowset. Start and End name the same day; Source is "All"
// (the "DesktopIE" source is left commented out below).
UnifiedViewRaw =
    VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
    PARAMS
    (
        Start = @"2016-07-26",
        End = @"2016-07-26",
        Source = @"All"
//      Source = @"DesktopIE"
    );
// Keep page views that are not themselves query pages but whose originating page is a
// query page in the "web" vertical and the "zh-cn" market, then count rows per
// (Query, NormalizedUrl). The URL is normalized with RetroIndex.UrlNormalizer.
// NOTE(review): the number after "Count >=" in the HAVING clause is missing in this copy,
// so the clause does not parse as written; restore the intended minimum count.
ClickData =
    SELECT Page_FromPage.Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
           COUNT() AS Count
    FROM UnifiedViewRaw
    WHERE Page_FromPage.IsQuery == true AND Page_FromPage.Vertical.ToLower() == "web" AND NOT Request_IsQuery AND Page_FromPage.Market.ToLower() == "zh-cn"
    HAVING Count >=  AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

    // Page_FromPage.IsQuery: True if the page is a query page
    // Vertical: Search Vertical of this PageView
    // Request_IsQuery bool: True if this page view is search engine result page

// No rowset is named in the OUTPUT statement; presumably the most recent rowset
// (ClickData) is written — confirm.
// NOTE(review): this writes "IEqueryclickurlpairsAll.wsv", while the union script in
// step 4 reads "IEqueryclickurlpairs.wsv"; verify which file name is intended.
OUTPUT
TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// String helpers that reduce a URL to a canonical, scheme-less form and extract its host.
/// </summary>
public class CMyUtils
{
    /// <summary>
    /// Lower-cases <paramref name="url"/>, strips a leading "http://" or "https://",
    /// then a leading "www.", and finally a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>
    /// The normalized URL, or an empty string when <paramref name="url"/> is null or empty.
    /// </returns>
    public static string NormalizeURL(string url)
    {
        // Guard against null/empty input instead of failing with a NullReferenceException.
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // Invariant lower-casing and ordinal comparisons keep the result independent of
        // the machine's current culture (e.g. the Turkish dotless-i rule).
        url = url.ToLowerInvariant();
        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        // Only one trailing slash is removed.
        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to inspect; may be null or empty.</param>
    /// <returns>
    /// The normalized host portion, or the whole normalized URL when it contains no "/".
    /// </returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Reducer that keeps the input schema and copies selected input rows to the output.
/// </summary>
/// <remarks>
/// NOTE(review): two lines of Reduce are truncated in this copy (a bare ";" and a bare ")"),
/// so the local declaration and the row-selection condition are not visible and the class
/// does not compile as shown. Judging only by the class name it presumably keeps the first
/// N rows of each group — confirm against the original script.
/// </remarks>
public class TopReducer : Reducer
{
    /// <summary>
    /// Declares the output schema as a clone of the input schema.
    /// </summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return input.Clone();
    }

    /// <summary>
    /// Walks the rows of <paramref name="input"/> and, for rows that pass a condition
    /// (missing in this copy), copies the row into <paramref name="output"/> and yields it.
    /// </summary>
    public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
    {
        // NOTE(review): the statement below lost its content (probably a counter declaration).
        ;
        foreach (Row row in input.Rows)
        {
            // NOTE(review): the "if (...)" header that belongs here is truncated to ")".
            )
            {
                row.Copy(output);
                yield return output;
            }
        }
    }
}

2,对bing的所有浏览搜索的提取代码:

Scope:

//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history
// Purpose: from the SearchLogPageView view, expand each page view's web results into
// rows, then count (word-broken query, normalized URL) pairs and write them to a .wsv file.
REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";

REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";

REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;

// Load one day of page views; the active PARAMS line uses Dataset "Mobile" and no sampling.
SlapiPageView =
      VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
       //VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
       //PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
       PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");

// Keep rows where the bot and marketing-traffic flags are false, in the "zh-cn" market
// and the "web" vertical. The query is passed through WordBreaker.BreakWords and the
// request time is formatted as yyyy-MM-dd.
ZHCNTraffic =
    SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
           Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
           Page_Entities_WebResults
    FROM SlapiPageView
    WHERE Request_IsBotVNext == false AND Request_IsMarketingTraffic == false
          AND string.IsNullOrEmpty(Market) == false AND Market.ToLower() == "zh-cn"
          AND string.IsNullOrEmpty(Vertical) == false AND Vertical.ToLower() == "web";
//AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");

// Expand each page view's web results with the FEXLogSimpleExtractor processor
// defined in the C# code-behind.
ProcessWebEntity =
    PROCESS ZHCNTraffic
    USING FEXLogSimpleExtractor;

// Count rows per (Query, NormalizedUrl).
// NOTE(review): the numbers after "Click >" and "PairCount >=" are missing in this copy,
// so the statement does not parse as written; restore the intended thresholds.
ClickQueryUrlPairs =
    SELECT Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
           COUNT() AS PairCount
    FROM ProcessWebEntity
    WHERE Click >  HAVING PairCount >=  AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

// No rowset is named in the OUTPUT statement; presumably the most recent rowset
// (ClickQueryUrlPairs) is written — confirm.
OUTPUT
TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;

/// <summary>
/// String helpers that reduce a URL to a canonical, scheme-less form and extract its host.
/// </summary>
public class URLUtility
{
    /// <summary>
    /// Lower-cases <paramref name="url"/>, strips a leading "http://" or "https://",
    /// then a leading "www.", and finally a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>
    /// The normalized URL, or an empty string when <paramref name="url"/> is null or empty.
    /// </returns>
    public static string NormalizeURL(string url)
    {
        // Guard against null/empty input instead of failing with a NullReferenceException.
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // Invariant lower-casing and ordinal comparisons keep the result independent of
        // the machine's current culture (e.g. the Turkish dotless-i rule).
        url = url.ToLowerInvariant();
        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        // Only one trailing slash is removed.
        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to inspect; may be null or empty.</param>
    /// <returns>
    /// The normalized host portion, or the whole normalized URL when it contains no "/".
    /// </returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Processor that expands the web results of each input row into one output row per
/// result, carrying the query, query date, URL, host, position and click count.
/// </summary>
/// <remarks>
/// NOTE(review): the "for" header and the inner "if" header are truncated in this copy,
/// so the class does not compile as shown and the emit condition is unknown.
/// </remarks>
public class FEXLogSimpleExtractor : Processor
{
    /// <summary>
    /// Declares the fixed output schema: Query, QueryDate, Url, Host, POS and Click.
    /// </summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
    }

    /// <summary>
    /// Reads "Query", "QDate" and "Page_Entities_WebResults" from each input row and
    /// yields an output row for the web results that pass the (missing) condition.
    /// </summary>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string Query = row["Query"].String;
            string QueryDate = row["QDate"].String;
            // NOTE(review): the "as" cast yields null when the value is not a WebResultList,
            // and the result is used below without a null check.
            var WebEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;

            // NOTE(review): loop header truncated; presumably "for (int i = 0" precedes it — confirm.
            ; i < WebEntities.Count;i++ )
            {
                string Url = WebEntities[i].TitleUrl;
                // Host is derived from the result's title URL via URLUtility.GetHost.
                string Host = URLUtility.GetHost(Url);
                int Pos = WebEntities[i].PositionOfEntityInTopLevelRegion;
                // Click is the number of entries in the result's Clicks collection.
                int Click = WebEntities[i].Clicks.Count;

                // NOTE(review): the "if (...)" header that belongs here is truncated to ")".
                )
                {
                    output["Query"].Set(Query);
                    output["QueryDate"].Set(QueryDate);
                    output["Url"].Set(Url);
                    output["Host"].Set(Host);
                    output["POS"].Set(Pos);
                    output["Click"].Set(Click);

                    yield return output;
                }
            }
        }
    }
}

3,搜索查询和分类的提取代码:

//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b
//Used for tracking history
// Purpose: read the latest RetroIndex snapshot, run RetroIndexProcessor over it to
// produce Url/Country/Language/Category, keep rows whose language is "zh_chs",
// and write them to a .wsv file.
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll";
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING RetroIndex;

// NOTE(review): the value of TierFlag is missing in this copy, so the PARAMS list
// does not parse as written; restore the intended value.
Snapshot =
    VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view"
    PARAMS
    (
        Sample = false,
        TierFlag =
    );

// This SELECT is not assigned to a named rowset, and the PROCESS below names no input;
// presumably PROCESS consumes the result of this SELECT — confirm.
SELECT Url,
       Header,
       Body,
       HttpHeader,
       CodePage
FROM Snapshot;

Uberchunk =
    PROCESS
    PRODUCE Url,
            Country,
            Language,
            Category
    USING RetroIndexProcessor
    HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs";

// No rowset is named in the OUTPUT statement; presumably the most recent rowset
// (Uberchunk) is written — confirm.
OUTPUT
TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// Helpers that inspect a "#TAB#"-separated category string for mobile-related entries.
/// </summary>
public class Utility
{
    // Separator between individual entries in a category string.
    private static readonly string[] CategorySeparators = new string[] { "#TAB#" };

    // Only entries starting with this prefix are considered.
    private const string CategoryPrefix = "aa00";

    private static readonly string[] MobileSuffixes = new string[]
    {
        "Mobi",
        "CrossDevice",
        "MobileFriendly",
        "MobileUnFriendly"
    };

    private static readonly string[] MobileUnFriendlySuffixes = new string[]
    {
        "MobileUnFriendly"
    };

    /// <summary>
    /// Reports whether any "aa00"-prefixed entry of <paramref name="category"/> ends with
    /// "Mobi", "CrossDevice", "MobileFriendly" or "MobileUnFriendly".
    /// </summary>
    /// <param name="category">"#TAB#"-separated category string; may be null or empty.</param>
    /// <returns>True when a matching entry exists; false otherwise, including for null/empty input.</returns>
    /// <remarks>
    /// NOTE(review): "MobileUnFriendly" entries also make this method return true, exactly
    /// as in the original code; confirm that this is intended for a "friendly" check.
    /// </remarks>
    public static bool CJKVersionMobileFriendly(string category)
    {
        return HasPrefixedEntryEndingWith(category, MobileSuffixes);
    }

    /// <summary>
    /// Reports whether any "aa00"-prefixed entry of <paramref name="category"/> ends with
    /// "MobileUnFriendly".
    /// </summary>
    /// <param name="category">"#TAB#"-separated category string; may be null or empty.</param>
    /// <returns>True when a matching entry exists; false otherwise, including for null/empty input.</returns>
    public static bool CJKVersionMobileUnFriendly(string category)
    {
        return HasPrefixedEntryEndingWith(category, MobileUnFriendlySuffixes);
    }

    // Shared scan: true when an entry starts with the prefix and ends with one of the suffixes.
    // Ordinal comparisons are used because the tags are fixed identifiers, not linguistic text.
    private static bool HasPrefixedEntryEndingWith(string category, string[] suffixes)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] cates = category.Split(CategorySeparators, StringSplitOptions.RemoveEmptyEntries);
        foreach (string cate in cates)
        {
            if (!cate.StartsWith(CategoryPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            foreach (string suffix in suffixes)
            {
                if (cate.EndsWith(suffix, StringComparison.Ordinal))
                {
                    return true;
                }
            }
        }

        return false;
    }
}

/// <summary>
/// Processor that emits one (Url, MobileClassifier) row for each input row that has a
/// non-empty Url and a Language equal to "zh_chs" (case-insensitive).
/// </summary>
/// <remarks>
/// NOTE(review): the declaration of MobileClassifier, parts of three conditions and the
/// three assigned values are truncated in this copy, so the class does not compile as
/// shown and the classifier values cannot be read from it.
/// </remarks>
public class CJKVersionMobileOkClassifierProcessor : Processor
{
    /// <summary>
    /// Declares the fixed output schema: Url (string) and MobileClassifier (int).
    /// </summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Url:string, MobileClassifier:int");
    }

    /// <summary>
    /// Reads the classifier feature columns of each qualifying row, derives
    /// MobileClassifier from them and yields the result together with the Url.
    /// </summary>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string Url = row["Url"].String;
            string Language = row["Language"].String;

            // Skip rows with an empty Url or a Language other than "zh_chs".
            if (!(string.IsNullOrEmpty(Url) == false && string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs"))
            {
                continue;
            }

            //classifier features
            string Category = row["Category"].String;
            string DUPipeline_MobileUrls = row["DUPipeline_MobileUrls"].String;
            string DUPipeline_ResponsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String;
            string DUV2_MobileUrl = row["DUV2_MobileUrl"].String;
            string InjHdr_MobileOkClassifier_v1 = row["InjHdr_MobileOkClassifier_v1"].String;
            string InjHdr_MobileOkX_v1 = row["InjHdr_MobileOkX_v1"].String;
            string InjHdr_MobileRedirect_V1 = row["InjHdr_MobileRedirect_V1"].String;
            string SpamJunkRuleID = row["SpamJunkRuleID"].String;

            // Prefer InjHdr_MobileOkClassifier_v1; fall back to InjHdr_MobileOkX_v1 when it is empty.
            string MobileOkClassifier = string.IsNullOrEmpty(InjHdr_MobileOkClassifier_v1) ? InjHdr_MobileOkX_v1 : InjHdr_MobileOkClassifier_v1;

            // NOTE(review): everything from the bare ";" below to the last "MobileClassifier = ;"
            // is truncated: the local declaration, the tail of the first condition, the two
            // following condition headers and all three assigned values are missing.
            // The first branch is taken when any mobile-URL / responsive-design / redirect
            // feature is non-empty or Utility.CJKVersionMobileFriendly(Category) is true.
            // MobileOkClassifier and SpamJunkRuleID are presumably used in the missing
            // conditions — confirm against the original script.
            ;
            if (string.IsNullOrEmpty(DUPipeline_MobileUrls) == false
                || string.IsNullOrEmpty(DUPipeline_ResponsiveDesignSpans) == false
                || string.IsNullOrEmpty(DUV2_MobileUrl) == false
                || string.IsNullOrEmpty(InjHdr_MobileRedirect_V1) == false
                || Utility.CJKVersionMobileFriendly(Category)
                || ("))
            {
                MobileClassifier = ;
            }
            "))
            {
                MobileClassifier = ;
            }
            ")
            {
                MobileClassifier = ;
            }

            output["Url"].Set(Url);
            output["MobileClassifier"].Set(MobileClassifier);
            yield return output;
        }
    }
}

4,对IE和bing进行union,然后对相同的query进行合并。

Scope

//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// Purpose: read the IE and Bing (Query, Url, Count) files, concatenate them, sum Count
// per (Query, Url) and write the result ordered by Query.

// NOTE(review): this reads "IEqueryclickurlpairs.wsv", while the IE extraction script in
// step 1 writes "IEqueryclickurlpairsAll.wsv"; verify which file name is intended.
ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();
bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

// UNION ALL keeps duplicate rows, so a pair present in both sources contributes both counts.
union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

// No GROUP BY is written; presumably the non-aggregated columns (Query, Url) form the
// grouping key — confirm.
result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";

5,得到Query, Category,ClickCount的对应。

//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// NOTE(review): this script is identical to the union script shown under step 4 (same
// GUID, inputs and output) and does not reference any Category column, although the
// heading of this step describes a Query/Category/ClickCount mapping. The script that
// was meant for this step may be missing — confirm.

// Read the IE and Bing (Query, Url, Count) files.
ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();
bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

// Concatenate both sources, keeping duplicate rows.
union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

// Sum Count per (Query, Url) and order by Query; no GROUP BY is written, so the
// grouping key is presumably the non-aggregated columns — confirm.
result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";

6,得到了 query->Category 之后,要算哪个 category 出现得最多。

每一条出现的地方*clickCount然后累加起来。

这里用到了reduce来做。

7,算分数。

Intern---Microsoft Academic China Team的更多相关文章

  1. Announcing Microsoft Research Open Data – Datasets by Microsoft Research now available in the cloud

    The Microsoft Research Outreach team has worked extensively with the external research community to ...

  2. Team Foundation 中的错误和事件消息

    Visual Studio Team System Team Foundation 中的错误和事件消息 Team Foundation 通过显示错误消息和事件消息来通知您操作成功以及操作失败.一部分错 ...

  3. Microsoft Dynamics CRM 分销行业解决方案

    Microsoft Dynamics CRM 分销行业解决方案 方案亮点 360度动态渠道信息管理 充分的客户细分 全面的业务代表考核指标 业务代表管理和能力建设 业务代表过程管理 业务代表费用管理 ...

  4. Azure China (4) 管理Azure China Storage Account

    <Windows Azure Platform 系列文章目录> Update 2015-05-10 强烈建议使用AzCopy工具,AzCopy命令行工具,是经过优化的.高性能Azure S ...

  5. Azure China (1) Azure公有云落地中国

    <Windows Azure Platform 系列文章目录> 微软公有云Microsoft Azure已经落地中国,官方网址:http://www.windowsazure.cn/. 在 ...

  6. Microsoft server software support for Microsoft Azure virtual machines

    http://support.microsoft.com/kb/2721672/en-us  Article ID: 2721672 - Last Review: November 22, 2014 ...

  7. 如何访问Microsoft Azure Storage

    首先先要创建存储账户 http://www.cnblogs.com/SignalTips/p/4119128.html 可以通过以下的几个方式访问 通过Visual Studio 2013 Commu ...

  8. Microsoft TFS 如何显示在Windows 的上下文菜单中

    How to showing in Windows Explorer context for TFS I am not sure if this would help or you are willi ...

  9. Microsoft .NET Pet Shop 简介

    最初研究 .NET Pet Shop 的目的是用 Microsoft .NET 实现 Sun 主要的 J2EE 蓝图应用程序 Sun Java Pet Store 同样的应用程序功能. 根据用 .NE ...

随机推荐

  1. [LeetCode] Binary Tree Inorder Traversal 二叉树的中序遍历

    Given a binary tree, return the inorder traversal of its nodes' values. For example:Given binary tre ...

  2. gpu对任意长度的矢量求和

    blockDim.x*gridDim.x 跳过一个grid int <<<参数1,参数2>>>(int *a,int * b,int * c); 如果是一维的,参数 ...

  3. 第一章 MYSQL的架构和历史

    在读第一章的过程中,整理出来了一些重要的概念. 锁粒度  表锁(服务器实现,忽略存储引擎). 行锁(存储引擎实现,服务器没有实现). 事务的ACID概念 原子性(要么全部成功,要么全部回滚). 一致性 ...

  4. AngularJs的$http发送POST请求,php无法接收Post的数据解决方案

      最近在使用AngularJs+Php开发中遇到php后台无法接收到来自AngularJs的数据,在网上也有许多解决方法,却都点到即止.多番摸索后记录下解决方法:tips:当前使用的AngularJ ...

  5. 常用js归纳

    一.获取地址栏参数 /*根据name获取URL参数*/ function getQueryString(name) { var reg = new RegExp("(^|&)&quo ...

  6. VirtualBox内ubuntu10.10系统和windows7 共享文件夹

    材料 virtualbox 4.3.0 ubuntu10.10 window 7 sp1 步骤 1.安装好虚拟机和操作系统,(具体步骤网上有很多) 2.安装虚拟机的增强功能包, 安装完成手动系统重新, ...

  7. 开启A20线(部分译)

    开启A20线 在查看或编写操作系统内核时一定会遇到A20线这个问题.本人对此一直都是似懂非懂的,查了些资料,决定弄明白于是有了这篇文章.其中前一部分是翻译一篇外国博文,但光有这篇文章依旧不能清楚地说明 ...

  8. 小猪cms微信二次开发之怎样分页

    $db=D('Classify'); $zid=$db->where(array('id'=>$this->_GET('fid'),'token'=>$this->tok ...

  9. [WP8.1开发]RSA 使用BouncyCastle 公钥解密

    写应用的时候遇到个服务器返回私钥加密过的数据 ,然后要在客户端用公钥解密的需求 ,一直没找到方法,应用搁置了一个学期,多方搜索,结论就是.net没有实现公钥解密的方法,要自己实现,于是硬着头皮开始看B ...

  10. ReactNative 使用微软的CodePush进行热更新,继续填坑

    1.别被开发环境骗了 在我们开发react native的时候,一键运行工程,js改了,只要cmd+R就可以刷新了.然后会轻易以为真正app上线的时候也是一样,只要app一打开就是最新的. 其实!这是 ...