



# Read a raw email file into a character vector, one element per line.
#
# Args:
#   path: path of the message file to read.
# Returns:
#   Character vector holding the lines of the file.
msg.full <- function(path) {
  con <- file(path, open = "rt", encoding = "latin1")
  # Close the connection on every exit path so repeated calls (one per
  # message in the corpus) do not leak file handles.
  on.exit(close(con), add = TRUE)
  msg <- readLines(con)
  return(msg)
}



# Extract the sender's email address from the lines of a message.
#
# Args:
#   msg.vec: character vector of message lines.
# Returns:
#   The first token containing "@" on the first line that contains "From: ",
#   or NA if there is no such line or no such token.
get.from <- function(msg.vec) {
  from <- msg.vec[grepl("From: ", msg.vec)]
  # Without a From line, strsplit(...)[[1]] would fail with a subscript error.
  if (length(from) == 0) {
    return(NA_character_)
  }
  # Break the first From line on quotes, colons, angle brackets and spaces.
  from <- strsplit(from, '[":<> ]')[[1]]
  # Drop the empty tokens produced by adjacent separators.
  from <- from[from != "" & from != " "]
  return(from[grepl("@", from)][1])
}


# Extract the subject text from the lines of a message.
#
# Args:
#   msg.vec: character vector of message lines.
# Returns:
#   The text following "Subject: " on the first line that contains it, or ""
#   when the message has no such line.
get.subject <- function(msg.vec) {
  subj <- msg.vec[grepl("Subject: ", msg.vec)]
  if (length(subj) > 0) {
    return(strsplit(subj, "Subject: ")[[1]][2])
  } else {
    # Always return a length-one value so that parse.email() yields the same
    # number of fields for every message.
    return("")
  }
}


# Extract the body of a message: everything after the first empty line.
#
# Args:
#   msg.vec: character vector of message lines.
# Returns:
#   A single string with the body lines joined by "\n"; "" when there is no
#   empty separator line or nothing follows it.
get.msg <- function(msg.vec) {
  first.blank <- which(msg.vec == "")[1]
  # seq(first.blank + 1, length(msg.vec), 1) fails when the separator is
  # missing (NA) or is the last line (descending range), so handle both here.
  if (is.na(first.blank) || first.blank >= length(msg.vec)) {
    return("")
  }
  msg <- msg.vec[(first.blank + 1):length(msg.vec)]
  return(paste(msg, collapse = "\n"))
}


# Extract the date string from the lines of a message.
#
# Args:
#   msg.vec: character vector of message lines.
# Returns:
#   The text of the first line starting with "Date: ", with the header name
#   and anything from the first "+" or "-" onwards removed, trimmed of
#   surrounding whitespace and cut to 25 characters. NA when no such line
#   exists.
get.date <- function(msg.vec) {
  date.idx <- grep("^Date: ", msg.vec)
  if (length(date.idx) == 0) {
    return(NA_character_)
  }
  date <- msg.vec[date.idx[1]]
  # Splitting on ": ", "+" and "-" leaves the date text as the second piece.
  date <- strsplit(date, "\\+|\\-|: ")[[1]][2]
  date <- gsub("^\\s+|\\s+$", "", date)
  return(strtrim(date, 25))
}


# Parse one email file into the fields used by the rest of the script.
#
# Args:
#   path: path of the message file.
# Returns:
#   Character vector c(date, sender, subject, body, path).
parse.email <- function(path) {
  full.msg <- msg.full(path)
  date <- get.date(full.msg)
  from <- get.from(full.msg)
  subj <- get.subject(full.msg)
  msg <- get.msg(full.msg)
  return(c(date, from, subj, msg, path))
}


# List the message files in the easy-ham directory, skipping the "cmds" entry.
easyham.docs <- list.files(easyham.path)
easyham.docs <- easyham.docs[easyham.docs != "cmds"]

# Parse every message; each element is the character vector returned by
# parse.email().
easyham.parse <- lapply(file.path(easyham.path, easyham.docs), parse.email)

# Stack the parsed vectors into a matrix, then convert to a data frame with
# one row per message.
ehparse.matrix <- do.call(rbind, easyham.parse)
allparse.df <- data.frame(ehparse.matrix, stringsAsFactors = FALSE)
names(allparse.df) <- c("Date", "From.EMail", "Subject", "Message", "Path")




# Example of the first date format: Wed, 04 Dec 2002 11:36:32

# Example of the second date format: 04 Dec 2002 11:36:32


# Parse date strings that may be written in either of two formats.
#
# Args:
#   dates:    character vector of date strings.
#   pattern1: strptime() format tried first.
#   pattern2: strptime() format used where pattern1 fails to parse.
# Returns:
#   The strptime() result for `dates`, with entries that pattern1 could not
#   parse replaced by the pattern2 result (NA where neither format matches).
date.converter <- function(dates, pattern1, pattern2) {
  pattern1.convert <- strptime(dates, pattern1)
  pattern2.convert <- strptime(dates, pattern2)
  unparsed <- is.na(pattern1.convert)
  pattern1.convert[unparsed] <- pattern2.convert[unparsed]
  return(pattern1.convert)
}

# strptime() layouts for the two date formats: pattern1 has a leading
# abbreviated weekday, pattern2 does not.
pattern2 <- "%d %b %Y %H:%M:%S"
pattern1 <- paste("%a,", pattern2)

# Use the C time locale so that %a and %b match English day and month names.
Sys.setlocale(category = "LC_TIME", locale = "C")

allparse.df$Date <- date.converter(allparse.df$Date, pattern1, pattern2)

# Lower-case the sender addresses and subjects before they are compared.
allparse.df$From.EMail <- tolower(allparse.df$From.EMail)
allparse.df$Subject <- tolower(allparse.df$Subject)

# Order messages chronologically; the first half becomes the training set.
priority.df <- allparse.df[order(allparse.df$Date), ]

priority.train <- priority.df[1:round(nrow(priority.df) / 2), ]




# with(priority.train, table(From.EMail)) counts how often each From.EMail
# value occurs in priority.train; melt() turns that table into a data frame
# whose count column is named "Freq".
from.weight <- melt(
  with(priority.train, table(From.EMail)),
  value.name = "Freq"
)

# Sort senders by ascending message count.
from.weight <- from.weight[order(from.weight$Freq), ]

# Keep only senders with more than 6 messages for plotting.
from.ex <- subset(from.weight, Freq > 6)

# Bar chart (drawn with rectangles, then flipped) of message counts for the
# senders in from.ex. The fill and outline colours are mapped as constant
# strings and resolved by the manual scales below, with legends suppressed.
from.scales <- ggplot(from.ex) +
  geom_rect(aes(xmin = 1:nrow(from.ex) - 0.5,
                xmax = 1:nrow(from.ex) + 0.5,
                ymin = 0,
                ymax = Freq,
                fill = "lightgrey",
                color = "darkblue")) +
  # One axis tick per sender, labelled with the sender address.
  scale_x_continuous(breaks = 1:nrow(from.ex), labels = from.ex$From.EMail) +
  coord_flip() +
  scale_fill_manual(values = c("lightgrey" = "lightgrey"), guide = "none") +
  scale_color_manual(values = c("darkblue" = "darkblue"), guide = "none") +
  ylab("Number of Emails Received (truncated at 6)") +
  xlab("Sender Address") +
  theme_bw() +
  theme(axis.text.y = element_text(size = 5, hjust = 1))





# Add two log-scaled versions of the per-sender message count: natural log
# (Weight) and base-10 log (log10Weight), both of Freq + 1.
from.weight$Weight <- log(from.weight$Freq + 1)
from.weight$log10Weight <- log10(from.weight$Freq + 1)

# Line plot comparing the absolute counts with the two log scalings; the
# line type identifies the scaling and the axis tick labels are hidden.
from.rescaled <- ggplot(from.weight, aes(x = 1:nrow(from.weight))) +
  geom_line(aes(y = Weight, linetype = "ln")) +
  geom_line(aes(y = log10Weight, linetype = "log10")) +
  geom_line(aes(y = Freq, linetype = "Absolute")) +
  scale_linetype_manual(values = c("ln" = 1,
                                   "log10" = 2,
                                   "Absolute" = 3),
                        name = "Scaling") +
  xlab("") +
  ylab("Number of emails Receieved") +
  theme_bw() +
  theme(axis.text.y = element_blank(), axis.text.x = element_blank())



# Find the messages that belong to reply threads.
#
# A message counts as part of a thread when its subject starts with "re: ".
#
# Args:
#   email.df: data frame with character columns Subject and From.EMail.
# Returns:
#   Character matrix with one row per thread message: column 1 ("senders") is
#   the sender address, column 2 ("threads") is the subject with the leading
#   "re: " removed.
find.threads <- function(email.df) {
  response.threads <- strsplit(email.df$Subject, "re: ")

  # A subject that begins with "re: " splits into a leading empty string.
  # isTRUE() treats NA and empty subjects as non-threads instead of letting
  # NA leak into the logical index.
  is.thread <- vapply(response.threads,
                      function(subj) isTRUE(subj[1] == ""),
                      logical(1))

  threads <- response.threads[is.thread]
  senders <- email.df$From.EMail[is.thread]

  # Re-join the remaining pieces so that any inner "re: " is preserved.
  # t[-1] (rather than t[2:length(t)]) stays correct when the subject is
  # exactly "re: " and only one piece exists.
  threads <- vapply(threads,
                    function(t) paste(t[-1], collapse = "re: "),
                    character(1))

  return(cbind(senders, threads))
}


# Thread data for the training set. The code below reads column 1 of this
# matrix as sender addresses and column 2 as thread subjects.
threads.matrix <- find.threads(priority.train)


# Weight senders by how often they appear in thread messages.
#
# Args:
#   threads.matrix: matrix whose first column holds the sender address of
#                   each thread message.
# Returns:
#   Data frame with one row per distinct sender and columns From.EMail,
#   Freq (number of thread messages) and Weight (log(Freq + 1)).
email.thread <- function(threads.matrix) {
  senders <- threads.matrix[, 1]
  senders.freq <- table(senders)

  # Build the three columns directly. The previous cbind() produced only the
  # name and weight columns while three column names were assigned, and
  # routing the numbers through a character matrix is unnecessary.
  freq <- as.numeric(senders.freq)
  senders.df <- data.frame(From.EMail = as.character(names(senders.freq)),
                           Freq = freq,
                           Weight = log(freq + 1),
                           stringsAsFactors = FALSE)

  return(senders.df)
}




# Measure the activity of a single thread.
#
# Args:
#   thread:   thread subject (without the leading "re: ").
#   email.df: data frame with columns Subject and Date.
# Returns:
#   c(freq, time.span, log.trans.weight), where freq is the number of
#   messages whose subject is `thread` or "re: <thread>", time.span is the
#   number of seconds between the earliest and latest of them, and
#   log.trans.weight is 10 + log10(freq / time.span). Returns c(NA, NA, NA)
#   when the thread has fewer than two messages or no usable time span.
thread.counts <- function(thread, email.df) {
  in.thread <- which(email.df$Subject == thread |
                       email.df$Subject == paste("re:", thread))
  thread.times <- email.df$Date[in.thread]

  freq <- length(thread.times)
  if (freq < 2) {
    return(c(NA, NA, NA))
  }

  min.time <- min(thread.times)
  max.time <- max(thread.times)
  time.span <- as.numeric(difftime(max.time, min.time, units = "secs"))

  # A zero or missing span would give an infinite or NA weight that then
  # propagates into every average built from the thread weights.
  if (!is.finite(time.span) || time.span <= 0) {
    return(c(NA, NA, NA))
  }

  trans.weight <- freq / time.span
  # The +10 offset is applied to the base-10 log of messages per second.
  log.trans.weight <- 10 + log(trans.weight, base = 10)
  return(c(freq, time.span, log.trans.weight))
}



# Per-sender data computed by email.thread() from the training-set threads.
senders.df <- email.thread(threads.matrix)


# Compute activity figures for every distinct thread.
#
# Args:
#   threads.matrix: matrix whose second column holds thread subjects.
#   email.df:       data frame passed on to thread.counts().
# Returns:
#   Character matrix with one row per distinct thread: the thread subject
#   followed by the three values returned by thread.counts().
get.threads <- function(threads.matrix, email.df) {
  threads <- unique(threads.matrix[, 2])

  # With no threads, do.call(rbind, list()) is NULL; return an empty matrix
  # of the expected width instead.
  if (length(threads) == 0) {
    return(matrix(character(0), nrow = 0, ncol = 4))
  }

  counts <- lapply(threads, function(t) thread.counts(t, email.df))
  thread.matrix <- do.call(rbind, counts)
  return(cbind(threads, thread.matrix))
}


# Thread activity for the training set, as a data frame with one row per
# thread.
thread.weights <- data.frame(get.threads(threads.matrix, priority.train),
                             stringsAsFactors = FALSE)
names(thread.weights) <- c("Thread", "Freq", "Response", "Weight")

# get.threads() returns text; convert the three measurement columns back to
# numbers.
thread.weights[c("Freq", "Response", "Weight")] <-
  lapply(thread.weights[c("Freq", "Response", "Weight")], as.numeric)

# Drop the threads for which no frequency was recorded.
thread.weights <- subset(thread.weights, !is.na(Freq))


# Count how often each term occurs across a set of documents.
#
# Args:
#   term.vec: character vector; each element is treated as one document.
#   control:  control list passed to TermDocumentMatrix().
# Returns:
#   Named numeric vector: total count of each term over all documents, with
#   the terms as names (callers use both the names and the counts).
term.counts <- function(term.vec, control) {
  vec.corpus <- Corpus(VectorSource(term.vec))
  vec.tdm <- TermDocumentMatrix(vec.corpus, control = control)
  # Rows of the term-document matrix are terms, so row sums are the totals.
  return(rowSums(as.matrix(vec.tdm)))
}

# Terms that occur in the thread subjects (stop words removed).
thread.terms <- names(term.counts(thread.weights$Thread,
                                  control = list(stopwords = TRUE)))

# Weight of a term = mean weight of the threads whose subject contains it.
term.weights <- sapply(thread.terms, function(term) {
  containing <- grepl(term, thread.weights$Thread, fixed = TRUE)
  mean(thread.weights$Weight[containing])
})

term.weights <- data.frame(list(Term = names(term.weights),
                                Weight = term.weights),
                           stringsAsFactors = FALSE,
                           row.names = 1:length(term.weights))

# Finally, build a weighting from how often terms occur in message bodies.
# This resembles spam detection, except that here the words typical of ham
# are the ones given high weight.
msg.terms <- term.counts(priority.train$Message,
                         control = list(stopwords = TRUE,
                                        removePunctuation = TRUE,
                                        removeNumbers = TRUE))

# Weight of a term = base-10 log of its total count.
msg.weights <- data.frame(list(Term = names(msg.terms),
                               Weight = log(msg.terms, base = 10)),
                          stringsAsFactors = FALSE,
                          row.names = 1:length(msg.terms))

# Terms seen only once have weight 0; keep only strictly positive weights.
msg.weights <- subset(msg.weights, Weight > 0)


# Look up the average weight of a set of terms or of a thread subject.
#
# Args:
#   search.term: when term = TRUE, a named vector whose names are the terms
#                to look up; when term = FALSE, the thread subject(s).
#   weight.df:   data frame with a Weight column and either a Term column
#                (term = TRUE) or a Thread column (term = FALSE).
#   term:        TRUE to match names(search.term) against weight.df$Term,
#                FALSE to match search.term against weight.df$Thread.
# Returns:
#   Mean weight of the matched rows, or 1 (the neutral value for a product
#   of weights) when nothing is searched for or nothing matches.
get.weights <- function(search.term, weight.df, term = TRUE) {
  if (length(search.term) < 1) {
    return(1)
  }

  if (term) {
    term.match <- match(names(search.term), weight.df$Term)
  } else {
    term.match <- match(search.term, weight.df$Thread)
  }

  # match() returns row positions in weight.df, so index Weight with those
  # positions rather than with the positions of the hits in search.term.
  match.weights <- weight.df$Weight[term.match[!is.na(term.match)]]

  if (length(match.weights) < 1) {
    return(1)
  }
  return(mean(match.weights))
}



# Compute the priority rank of one message.
#
# The rank is the product of five weights, each defaulting to 1 when no
# information is available: sender frequency, sender activity in threads,
# thread activity, subject-term weight and body-term weight.
#
# Args:
#   path: path of the message file.
# Returns:
#   Character vector c(date, sender, subject, rank).
rank.message <- function(path) {
  msg <- parse.email(path)
  sender <- msg[2]

  # Weighting based on message author.
  # First, the sender's overall frequency weight.
  from.idx <- which(from.weight$From.EMail == sender)
  from <- if (length(from.idx) > 0) from.weight$Weight[from.idx[1]] else 1

  # Second, the sender's weight among thread participants.
  thread.idx <- which(senders.df$From.EMail == sender)
  thread.from <- if (length(thread.idx) > 0) {
    senders.df$Weight[thread.idx[1]]
  } else {
    1
  }

  # Thread activity only applies when the subject starts with "re: ".
  # isTRUE() keeps a missing or empty subject from producing an NA condition.
  subj <- strsplit(tolower(msg[3]), "re: ")
  is.thread <- isTRUE(subj[[1]][1] == "")
  if (is.thread) {
    activity <- get.weights(subj[[1]][2], thread.weights, term = FALSE)
  } else {
    activity <- 1
  }

  # Next, weight based on terms.

  # Weight based on terms in threads.
  thread.terms <- term.counts(msg[3], control = list(stopwords = TRUE))
  thread.terms.weights <- get.weights(thread.terms, term.weights)

  # Weight based on terms in all messages. The result gets its own local
  # name so it does not shadow the msg.weights table being looked up.
  msg.terms <- term.counts(msg[4],
                           control = list(stopwords = TRUE,
                                          removePunctuation = TRUE,
                                          removeNumbers = TRUE))
  msg.terms.weights <- get.weights(msg.terms, msg.weights)

  # Calculate rank by interacting all weights.
  rank <- prod(from,
               thread.from,
               activity,
               thread.terms.weights,
               msg.terms.weights)

  return(c(msg[1], msg[2], msg[3], rank))
}


# Paths of the first (training) and second (testing) half of the
# chronologically ordered messages.
train.paths <- priority.df$Path[1:round(nrow(priority.df) / 2)]
test.paths <- priority.df$Path[(round(nrow(priority.df) / 2) + 1):nrow(priority.df)]

# Now, create a full-featured training set: rank every training message and
# collect the results in a data frame tagged "TRAINING".
train.ranks <- suppressWarnings(lapply(train.paths, rank.message))
train.ranks.matrix <- cbind(train.paths,
                            do.call(rbind, train.ranks),
                            "TRAINING")
train.ranks.df <- data.frame(train.ranks.matrix, stringsAsFactors = FALSE)
names(train.ranks.df) <- c("Message", "Date", "From", "Subj", "Rank", "Type")
train.ranks.df$Rank <- as.numeric(train.ranks.df$Rank)

# Set the priority threshold to the median of all training rank weights.
priority.threshold <- median(train.ranks.df$Rank)

# Visualize the results to locate threshold
# Density of the training ranks, with the priority threshold drawn as a
# dashed vertical line. The fill colour is mapped as a constant string and
# resolved by the manual scale, with its legend suppressed.
threshold.plot <- ggplot(train.ranks.df, aes(x = Rank)) +
  stat_density(aes(fill = "darkred")) +
  geom_vline(xintercept = priority.threshold, linetype = 2) +
  scale_fill_manual(values = c("darkred" = "darkred"), guide = "none") +
  # End the chain with the same theme as the other plots; a trailing "+"
  # would pull the following statement into this expression.
  theme_bw()



# Classify each training message as priority or not using the median
# threshold; 1 means priority.
train.ranks.df$Priority <- ifelse(train.ranks.df$Rank >= priority.threshold,
                                  1,
                                  0)

# Rank the testing messages the same way and collect them in a data frame
# tagged "TESTING", classified against the training threshold.
test.ranks <- suppressWarnings(lapply(test.paths, rank.message))
test.ranks.matrix <- cbind(test.paths,
                           do.call(rbind, test.ranks),
                           "TESTING")
test.ranks.df <- data.frame(test.ranks.matrix, stringsAsFactors = FALSE)
names(test.ranks.df) <- c("Message", "Date", "From", "Subj", "Rank", "Type")
test.ranks.df$Rank <- as.numeric(test.ranks.df$Rank)
test.ranks.df$Priority <- ifelse(test.ranks.df$Rank >= priority.threshold,
                                 1,
                                 0)

# Finally, we combine the data sets.

# Finally, combine the training and testing results into one data set.
final.df <- rbind(train.ranks.df, test.ranks.df)

# Dates were carried through the rank matrices as text; parse them again.
final.df$Date <- date.converter(final.df$Date, pattern1, pattern2)

# Reverse the ascending date order to put the latest messages first.
final.df <- final.df[rev(order(final.df$Date)), ]

# Save the final data set.
write.csv(final.df, file.path("data", "final_df.csv"), row.names = FALSE)


# Overlaid rank densities for the training and testing messages, with the
# priority threshold drawn as a dashed vertical line.
testing.plot <- ggplot(subset(final.df, Type == "TRAINING"), aes(x = Rank)) +
  stat_density(aes(fill = Type, alpha = 0.65)) +
  stat_density(data = subset(final.df, Type == "TESTING"),
               aes(fill = Type, alpha = 0.65)) +
  geom_vline(xintercept = priority.threshold, linetype = 2) +
  scale_alpha(guide = "none") +
  scale_fill_manual(values = c("TRAINING" = "darkred", "TESTING" = "darkblue")) +
  # End the chain with the same theme as the other plots; a trailing "+"
  # leaves the expression unterminated.
  theme_bw()







