求个xpath的写法,不知道该怎么写了

机器学习之禅 2014-06-04 10:40:34
我想从搜索页面抓取所有标题,
但是现在有个问题,根据关键词匹配后,标题中的部分内容标红了,
比如说,我搜索"招远 亲人",体现在代码中如下:
<a href="......">
<font color="red">招远</font>
”血案遇难者家属在案发地祭奠“<font color="red">亲人</font>
</a>

那么每次搜索这个a标签是固定的,但是里面的font标签是不固定的,
怎么能抽取出整个标题呢?即“招远血案遇难者家属在案发地祭奠亲人”
...全文
336 7 打赏 收藏 转发到动态 举报
写回复
用AI写文章
7 条回复
切换为时间正序
请发表友善的回复…
发表回复
机器学习之禅 2014-06-06
  • 打赏
  • 举报
回复
引用 3 楼 suciver 的回复:
又是这种月经贴,解析网页请用jsoup,只要会jquery的选择器(不会可以google)这些都是很简单的
最近google上不去了。。
lovejavaee 2014-06-05
  • 打赏
  • 举报
回复
发一个项目的解析XML的Util类给你

package com.jst.common.util;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;

/**
 * Static helpers for building and reading XML messages of the form
 * {@code <MSG><HEAD>...</HEAD><BODY>...</BODY></MSG>} with dom4j.
 *
 * <p>The builder methods insert their arguments into the XML verbatim, without
 * escaping. Callers are responsible for escaping {@code <}, {@code >} and
 * {@code &} in plain-text values.
 *
 * @author wangyulong
 * @date 2011-05-11
 */
public class MessageDom4jUtil {

    public static final String MSG_HEAD = "/MSG/HEAD";

    public static final String MSG_BODY_REC = "/MSG/BODY/REC";

    public static final String MSG_BODY = "/MSG/BODY";

    // XPath of the return-message node
    public static final String MSG_HEAD_RET_MSG = "/MSG/HEAD/RET_MSG";

    // XPath of the return-code node
    public static final String MSG_HEAD_RET_CODE = "/MSG/HEAD/RET_CODE";

    // Return-code value "T"
    public static final String RET_CODE_TRUE = "T";

    // Return-code value "F"
    public static final String RET_CODE_FALSE = "F";

    public static final Log log = LogFactory.getLog(MessageDom4jUtil.class);

    // XML declaration shared by every message built by this class
    private static final String XML_DECLARATION = "<?xml version='1.0' encoding='GBK'?>";

    /**
     * Parses {@code text} and returns the string value of the element selected
     * by the XPath {@code node}.
     *
     * @param text the XML document as a string
     * @param node XPath expression selecting the wanted element
     * @return the string value of the first matching element
     * @throws DocumentException if {@code text} cannot be parsed or no element
     *         matches {@code node}
     */
    public static String getText(String text, String node) throws DocumentException {
        Element element = selectElement(createDocument(text), node);
        if (element == null) {
            throw new DocumentException("xml文档格式不正确,不能找到" + node);
        }
        return element.getStringValue();
    }

    /**
     * Returns the string value of {@code /MSG/HEAD/RET_CODE}.
     *
     * @param xml the XML document as a string
     * @return the string value of the RET_CODE element
     * @throws DocumentException if {@code xml} cannot be parsed or has no
     *         RET_CODE element
     */
    public static String getRetCodeValue(String xml) throws DocumentException {
        return getText(xml, MSG_HEAD_RET_CODE);
    }

    /**
     * Returns the string value of {@code /MSG/HEAD/RET_MSG}.
     *
     * @param xml the XML document as a string
     * @return the string value of the RET_MSG element
     * @throws DocumentException if {@code xml} cannot be parsed or has no
     *         RET_MSG element
     */
    public static String getRetMsgValue(String xml) throws DocumentException {
        return getText(xml, MSG_HEAD_RET_MSG);
    }

    /**
     * Builds a message whose HEAD holds RET_CODE and RET_MSG and whose BODY
     * holds {@code recStr}.
     *
     * @param retCode value placed inside {@code <RET_CODE>}
     * @param retMsg value placed inside {@code <RET_MSG>}
     * @param recStr XML fragment placed inside {@code <BODY>}; skipped when
     *        {@code StringUtil.isNotEmpty} rejects it
     * @return the assembled XML string
     */
    public static String getRetMsg(String retCode, String retMsg, String recStr) {
        String body = StringUtil.isNotEmpty(recStr) ? recStr : "";
        return buildMessage(buildRetHead(retCode, retMsg), body);
    }

    /**
     * Builds a message whose HEAD holds RET_CODE and RET_MSG and whose BODY is
     * empty.
     *
     * @param retCode value placed inside {@code <RET_CODE>}
     * @param retMsg value placed inside {@code <RET_MSG>}
     * @return the assembled XML string
     */
    public static String getRetMsg(String retCode, String retMsg) {
        return buildMessage(buildRetHead(retCode, retMsg), "");
    }

    /**
     * Builds a message with an empty BODY whose HEAD contains one element per
     * name/value pair in {@code array}.
     *
     * @param array alternating element names and values:
     *        {@code {name0, value0, name1, value1, ...}}; {@code null} is
     *        treated as no pairs
     * @return the assembled XML string
     * @throws IllegalArgumentException if {@code array} has an odd length, i.e.
     *         the last name has no value
     */
    public static String getRetMsg(String[] array) {
        if (array == null) {
            return buildMessage("", "");
        }
        if (array.length % 2 != 0) {
            // Fail with a clear message instead of an ArrayIndexOutOfBoundsException mid-build.
            throw new IllegalArgumentException(
                    "array must hold name/value pairs (even length), got length " + array.length);
        }
        StringBuilder head = new StringBuilder();
        for (int i = 0; i < array.length; i += 2) {
            head.append("<").append(array[i]).append(">");
            head.append(array[i + 1]);
            head.append("</").append(array[i]).append(">");
        }
        return buildMessage(head.toString(), "");
    }

    /**
     * Builds and parses a request skeleton: an empty {@code <REQ_PARA>} inside
     * HEAD and an empty BODY.
     *
     * @return the parsed skeleton document
     * @throws DocumentException if the skeleton cannot be parsed
     */
    public static Document getReqParaString() throws DocumentException {
        return createDocument(buildMessage("<REQ_PARA></REQ_PARA>", ""));
    }

    /**
     * Returns the {@code /MSG/BODY} element of {@code document}.
     *
     * @param document the parsed message
     * @return the BODY element, or {@code null} if there is none
     * @throws DocumentException never thrown by this implementation; kept in
     *         the signature for existing callers
     */
    public static Element selectBodyElement(Document document) throws DocumentException {
        return selectElement(document, MSG_BODY);
    }

    /**
     * Parses {@code msg} and returns its first {@code /MSG/BODY/REC} element.
     *
     * @param msg the XML document as a string
     * @return the first REC element, or {@code null} if there is none
     * @throws DocumentException if {@code msg} cannot be parsed
     */
    public static Element selectRecElement(String msg) throws DocumentException {
        return selectElement(createDocument(msg), MSG_BODY_REC);
    }

    /**
     * Returns the {@code /MSG/HEAD} element of {@code document}.
     *
     * @param document the parsed message
     * @return the HEAD element, or {@code null} if there is none
     * @throws DocumentException never thrown by this implementation; kept in
     *         the signature for existing callers
     */
    public static Element selectHeadElement(Document document) throws DocumentException {
        return selectElement(document, MSG_HEAD);
    }

    /**
     * Parses {@code msg} and returns its {@code /MSG/HEAD} element.
     *
     * @param msg the XML document as a string
     * @return the HEAD element, or {@code null} if there is none
     * @throws DocumentException if {@code msg} cannot be parsed
     */
    public static Element selectHeadElement(String msg) throws DocumentException {
        return selectElement(createDocument(msg), MSG_HEAD);
    }

    /**
     * Parses an XML string into a dom4j document.
     *
     * @param msg the XML document as a string
     * @return the parsed document
     * @throws DocumentException if {@code msg} is {@code null} or not
     *         well-formed; the parser's exception is attached as the cause
     */
    public static Document createDocument(String msg) throws DocumentException {
        if (msg == null) {
            throw new DocumentException("xml文件格式不正确");
        }
        // NOTE(review): msg may come from an untrusted source; confirm the dom4j
        // version in use disables DTDs/external entities (XXE) by default.
        try {
            return DocumentHelper.parseText(msg);
        } catch (DocumentException e) {
            // Pass the throwable so the stack trace is logged, not just toString().
            log.error("xml文件格式不正确", e);
            DocumentException wrapped = new DocumentException("xml文件格式不正确");
            // initCause keeps getMessage() exactly as before while preserving the cause.
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * Returns the first {@code /MSG/BODY/REC} element of {@code document}.
     *
     * @param document the parsed message
     * @return the first REC element, or {@code null} if there is none
     * @throws DocumentException never thrown by this implementation; kept in
     *         the signature for existing callers
     */
    public static Element selectRecElement(Document document) throws DocumentException {
        return selectElement(document, MSG_BODY_REC);
    }

    /**
     * Manual smoke test: prints a sample login request message.
     *
     * @param args unused
     * @throws DocumentException never thrown by this implementation
     */
    public static void main(String[] args) throws DocumentException {
        String xmlStr = "<?xml version=\"1.0\" encoding=\"GBK\"?>\n<MSG><HEAD><REQ_PARA><USER_CODE>test1</USER_CODE><PASSWORD>password</PASSWORD><TYPE>login</TYPE></REQ_PARA></HEAD><BODY/></MSG>";
        System.out.println(xmlStr);
    }

    /**
     * Returns the first element matching {@code xpath}, or {@code null} when
     * nothing matches or the first match is not an element.
     *
     * <p>{@code selectSingleNode} is used instead of {@code selectObject}
     * because the latter returns a {@code List} for zero or several matches,
     * which cannot be cast to {@code Element}.
     */
    private static Element selectElement(Document document, String xpath) {
        Object found = document.selectSingleNode(xpath);
        return (found instanceof Element) ? (Element) found : null;
    }

    /** Builds the {@code <RET_CODE>}/{@code <RET_MSG>} pair that goes inside HEAD. */
    private static String buildRetHead(String retCode, String retMsg) {
        return new StringBuilder()
                .append("<RET_CODE>").append(retCode).append("</RET_CODE>")
                .append("<RET_MSG>").append(retMsg).append("</RET_MSG>")
                .toString();
    }

    /** Wraps the given HEAD and BODY content in the XML declaration and MSG envelope. */
    private static String buildMessage(String headContent, String bodyContent) {
        return new StringBuilder(XML_DECLARATION)
                .append("<MSG>")
                .append("<HEAD>").append(headContent).append("</HEAD>")
                .append("<BODY>").append(bodyContent).append("</BODY>")
                .append("</MSG>")
                .toString();
    }
}

lovejavaee 2014-06-05
  • 打赏
  • 举报
回复
使用的是dom4j的方式解析xml
lovejavaee 2014-06-05
  • 打赏
  • 举报
回复

/**
	 * Returns the string value of /MSG/HEAD/RET_MSG.
	 * 
	 * @param xml the XML document as a string
	 * @return the string value of the RET_MSG element
	 * @throws DocumentException if xml cannot be parsed or has no RET_MSG element
	 */
	public static String getRetMsgValue(String xml) throws DocumentException {
		Document document = createDocument(xml);
		// selectSingleNode yields null when nothing matches; selectObject would
		// return a List there, and casting that to Element fails.
		Object found = document.selectSingleNode("/MSG/HEAD/RET_MSG");
		if (!(found instanceof Element)) {
			throw new DocumentException("xml文档格式不正确,不能找到" + "/MSG/HEAD/RET_MSG");
		}
		return ((Element) found).getStringValue();
	}

tony4geek 2014-06-04
  • 打赏
  • 举报
回复

String s="<font color=\"red\">招远</font>”血案遇难者家属在案发地祭奠“<font color=\"red\">亲人</font>";
// Strip opening and closing font tags, with or without attributes, in any letter case.
// \b stops the pattern from matching other tags whose name merely starts with "font".
System.out.println(s.replaceAll("(?i)</?font\\b[^>]*>", ""));
suciver 2014-06-04
  • 打赏
  • 举报
回复
又是这种月经贴,解析网页请用jsoup,只要会jquery的选择器(不会可以google)这些都是很简单的
艾德 2014-06-04
  • 打赏
  • 举报
回复
/a//text()

81,094

社区成员

发帖
与我相关
我的任务
社区描述
Java Web 开发
社区管理员
  • Web 开发社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧