-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathHtml2TxtUtil.java
More file actions
58 lines (47 loc) · 2.03 KB
/
Html2TxtUtil.java
File metadata and controls
58 lines (47 loc) · 2.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
package cn.sinobest.ypgj.util;
import java.util.regex.Pattern;
/**
* 过滤JSP页面标签
*
* @author linmi
*
*/
public class Html2TxtUtil {
public Html2TxtUtil() {
//
}
public static String html2txt(String inputString){
String htmlStr = inputString; //含html标签的字符串
String textStr ="";
java.util.regex.Pattern p_script;
java.util.regex.Matcher m_script;
java.util.regex.Pattern p_style;
java.util.regex.Matcher m_style;
java.util.regex.Pattern p_html;
java.util.regex.Matcher m_html;
java.util.regex.Pattern p_ba;
java.util.regex.Matcher m_ba;
try {
String regEx_script = "<[\\s]*?script[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?script[\\s]*?>"; //定义script的正则表达式{或<script[^>]*?>[\\s\\S]*?<\\/script> }
String regEx_style = "<[\\s]*?style[^>]*?>[\\s\\S]*?<[\\s]*?\\/[\\s]*?style[\\s]*?>"; //定义style的正则表达式{或<style[^>]*?>[\\s\\S]*?<\\/style> }
String regEx_html = "<[^>]+>"; //定义HTML标签的正则表达式
String patternStr = "\\s+";
p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
m_script = p_script.matcher(htmlStr);
htmlStr = m_script.replaceAll(""); //过滤script标签
p_style = Pattern.compile(regEx_style,Pattern.CASE_INSENSITIVE);
m_style = p_style.matcher(htmlStr);
htmlStr = m_style.replaceAll(""); //过滤style标签
p_html = Pattern.compile(regEx_html,Pattern.CASE_INSENSITIVE);
m_html = p_html.matcher(htmlStr);
htmlStr = m_html.replaceAll(""); //过滤html标签
p_ba = Pattern.compile(patternStr,Pattern.CASE_INSENSITIVE);
m_ba = p_ba.matcher(htmlStr);
//htmlStr = m_ba.replaceAll(""); //过滤空格
textStr = htmlStr;
}catch(Exception e) {
System.err.println("Html2Text: " + e.getMessage());
}
return textStr;//返回文本字符串
}
}