62,612
社区成员
发帖
与我相关
我的任务
分享
// Sample document: a single <a> element wrapped in a minimal page.
String html = "<html><head><title>开源中国社区</title></head>" + "<body><a>17-06-18_00.tar.gz</a> </body></html>";
Document document = Jsoup.parse(html);
// For every <a> element, print its href attribute value and then its text.
for (Element anchor : document.select("a")) {
    System.out.println(anchor.attr("href"));
    System.out.println(anchor.text());
}
<td 属性 = "直" 是否换行="yes">
TD中起始标签
和结束标签不同行
内容也是多行的
</td>
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Prints the text content of every {@code <td>} element found in {@code source.html}.
 */
public class Test {
    /** File read by {@link #main(String[])}, resolved against the working directory. */
    private static final Path SOURCE_FILE = Paths.get("source.html");

    /**
     * Matches a {@code <td>} element whose attributes all have double-quoted values and whose
     * body contains no {@code <} character. The body, without surrounding whitespace, is exposed
     * as the named group {@code content}.
     */
    private static final Pattern TD_PATTERN = Pattern.compile(
            "(?x)<td(\\s+[^=]+=\\s*\"[^\"]*\")*\\s*>\\s*(?<content>[^<]*?)\\s*</td>");

    /**
     * Reads {@code source.html} and prints one line per matching {@code <td>} body.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String content;
        try {
            content = loadContent();
        } catch (IOException e) {
            // Without the file there is nothing to scan; report and stop instead of
            // handing a null string to the matcher.
            System.err.println("Unable to read " + SOURCE_FILE + ": " + e);
            return;
        }
        for (String cell : extractCellContents(content)) {
            System.out.println(cell);
        }
    }

    /**
     * Collects the body text of every {@code <td>} element matched by {@link #TD_PATTERN}.
     *
     * @param html markup to scan; {@code null} is treated as empty input
     * @return the matched bodies in document order, never {@code null}
     */
    static List<String> extractCellContents(String html) {
        List<String> cells = new ArrayList<>();
        if (html == null) {
            return cells;
        }
        Matcher matcher = TD_PATTERN.matcher(html);
        while (matcher.find()) {
            cells.add(matcher.group("content"));
        }
        return cells;
    }

    /**
     * Reads the whole of {@link #SOURCE_FILE}, terminating every line with the platform line
     * separator.
     *
     * @return the file content
     * @throws IOException if the file cannot be opened or a read error occurs
     */
    private static String loadContent() throws IOException {
        StringBuilder content = new StringBuilder();
        try (Scanner source = new Scanner(SOURCE_FILE)) {
            while (source.hasNextLine()) {
                content.append(source.nextLine()).append(System.lineSeparator());
            }
            // Scanner swallows read errors and just reports "no more input"; surface them here.
            IOException readFailure = source.ioException();
            if (readFailure != null) {
                throw readFailure;
            }
        }
        return content.toString();
    }
}
package test.gt50;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page and prints every simple {@code <td>} cell found in it.
 */
public class Test57 {
    /** Address of the page that {@link #main(String[])} downloads. */
    private static final String PAGE_URL = "https://m.78500.cn/zs/ssq/";

    /** Charset name used to decode the downloaded bytes. */
    private static final String PAGE_CHARSET = "GBK";

    /**
     * Matches a {@code <td>} element that has either no attribute or a single {@code class=}
     * attribute and whose body is one run of word characters, optionally padded by whitespace.
     */
    private static final Pattern TD_PATTERN = Pattern.compile(
            "<td\\s?(class=[\\p{Punct}\\p{Alpha}]+)?>\\s*\\w+\\s*</td>");

    /**
     * Fetches {@link #PAGE_URL} and prints each matching {@code <td>} element on its own line.
     * If the download fails, the stack trace is printed and nothing else is output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        StringBuilder page = new StringBuilder();
        // try-with-resources closes the stream and both readers even when reading fails.
        try (InputStream in = new URL(PAGE_URL).openStream();
             InputStreamReader isr = new InputStreamReader(in, PAGE_CHARSET);
             BufferedReader reader = new BufferedReader(isr)) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Line terminators are dropped, so the page is scanned as one long line.
                page.append(line);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return;
        }
        printMatchingCells(page.toString());
    }

    /**
     * Prints every substring of {@code html} matched by {@link #TD_PATTERN}, one per line,
     * including the surrounding {@code <td>} tags.
     *
     * @param html markup to scan
     */
    static void printMatchingCells(String html) {
        Matcher m = TD_PATTERN.matcher(html);
        while (m.find()) {
            System.out.println(m.group());
        }
    }
}
// Replace the placeholder below with your HTML.
String s = "xxx";
String regex = "<td\\s?(class=[\\p{Punct}\\p{Alpha}]+)?>[\\p{Alpha}\\s\\w(\u4E00-\u9FA5):]*</td>";
Matcher cellMatcher = Pattern.compile(regex).matcher(s);
while (cellMatcher.find()) {
    // Remove the closing tag, then keep what follows the first '>' (the end of the opening tag).
    String withoutClosingTag = cellMatcher.group().replaceAll("</td>", "");
    int openingTagEnd = withoutClosingTag.indexOf(">");
    System.out.println(withoutClosingTag.substring(openingTagEnd + 1).trim());
}
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extracts and prints the values of simple {@code <td>} cells from a sample HTML table.
 */
public class Test12 {
    /**
     * Matches a {@code <td>} element that has either no attribute or a single {@code class=}
     * attribute and whose body is one run of word characters, optionally padded by whitespace.
     * The word run is exposed as the named group {@code value}.
     */
    private static final Pattern TD_PATTERN = Pattern.compile(
            "<td\\s?(class=[\\p{Punct}\\p{Alpha}]+)?>\\s*(?<value>\\w+)\\s*</td>");

    /**
     * Prints the value of every matching cell of the built-in sample table, one per line.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String s= "<tr class=\"z_tr_hui\">\r\n" +
                "<td>20180001</td>\r\n" +
                "<td class=\"z_font_red\"> 534234143432 </td>\r\n" +
                "<td class=\"z_font_blue\"> 1232 </td>\r\n" +
                "<td>1330</td>\r\n" +
                "<td>5453</td>\r\n" +
                "</tr>\r\n" +
                "<tr class=\"z_tr_fen\">\r\n" +
                "<td>20180002</td>\r\n" +
                "<td class=\"z_font_red\"> 534234143432 </td>\r\n" +
                "<td class=\"z_font_blue\"> 1233 </td>\r\n" +
                "<td>1220</td>\r\n" +
                "<td>5333</td>\r\n" +
                "</tr>\r\n" +
                "<tr class=\"z_tr_hui\">\r\n" +
                "<td>20180003</td>\r\n" +
                "<td class=\"z_font_red\"> 534234143432 </td>\r\n" +
                "<td class=\"z_font_blue\"> 1234 </td>\r\n" +
                "<td>1231</td>\r\n" +
                "<td>5354</td>\r\n" +
                "</tr>\r\n" +
                "<tr class=\"z_tr_fen\">\r\n" +
                "<td>20180004</td>\r\n" +
                "<td class=\"z_font_red\"> 534234143432 </td>\r\n" +
                "<td class=\"z_font_blue\"> 1235 </td>\r\n" +
                "<td>1230</td>\r\n" +
                "<td>5353</td>\r\n" +
                "</tr>";
        printCellValues(s);
    }

    /**
     * Prints the value of every cell in {@code html} matched by {@link #TD_PATTERN}, one per line.
     *
     * <p>The value is read from a capturing group rather than by stripping characters from the
     * whole match, so letters and underscores inside a cell are kept intact.
     *
     * @param html markup to scan
     */
    static void printCellValues(String html) {
        Matcher m = TD_PATTERN.matcher(html);
        while (m.find()) {
            System.out.println(m.group("value"));
        }
    }
}