直截了当,见代码:
# Collect the e-mail addresses posted in the replies of a Baidu Tieba thread
# and join them into a single semicolon-separated string (email.txt).

url <- "http://tieba.baidu.com/p/1787614150" # thread page to download

# NOTE(review): the encoding marks R documents for readLines() are "latin1",
# "UTF-8" and "bytes"; "Latin-1" is kept unchanged here -- confirm which
# value was intended before touching it.
web <- readLines(url, encoding = "Latin-1") # one element per line of HTML

# Keep only the lines that open a reply body (literal match, not a regex).
name <- web[grep('<cc><div id="post_content_', web, fixed = TRUE)]

# Capture group 1: everything between the reply container and the address
# (greedy); capture group 2: the e-mail address itself; capture group 3: one
# "label." piece of the domain. The trailing lookahead stops a match from
# ending in the middle of a longer alphabetic run.
pattern <- '(?<=class="d_post_content">)(.*)(\\b[a-zA-Z0-9]+@([a-zA-Z0-9]+\\.){1,2}[a-zA-Z0-9]+)(?![a-zA-Z])'

# Replies that contain an address, and the match/capture offsets for each.
email.exist <- grep(pattern, name, perl = TRUE, value = TRUE)
email.position <- gregexpr(pattern, email.exist, perl = TRUE)

# Cut capture group 2 of the first match out of every reply.
# - seq_along() keeps this safe when no reply contains an address (the result
#   is then character(0) instead of a subscript error).
# - The capture attributes are matrices (one row per match, one column per
#   capture group), so they are indexed as [1, 2] rather than linearly.
# - The attribute is spelled out as "capture.start" instead of relying on
#   attr()'s partial matching.
email.name <- vapply(
  seq_along(email.exist),
  function(i) {
    hit <- email.position[[i]]
    first <- attr(hit, "capture.start")[1, 2]
    last <- first + attr(hit, "capture.length")[1, 2] - 1
    substr(email.exist[i], first, last)
  },
  character(1)
)

# Semicolon-separated list, ready to paste into a mail client's "To" field.
email.txt <- paste(email.name, collapse = ";")