From b8945764ef8126eb781aa08257367b2c7db9d0d2 Mon Sep 17 00:00:00 2001
From: gtrivedi <trivedigaurav@gmail.com>
Date: Thu, 23 Jul 2015 15:37:31 -0400
Subject: [PATCH] Fix header separation in Pathology

---
 .../nih/backend/featureVector/Preprocess.java | 89 ++++++++++---------
 ...xtFileFeedbackManager_LibSVM_WordTree.java |  4 +-
 2 files changed, 48 insertions(+), 45 deletions(-)

diff --git a/src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java b/src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java
index 11e7365..69e10bb 100755
--- a/src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java
+++ b/src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java
@@ -267,40 +267,42 @@ public static String[] separatePathologyHeaderFooter(String input) throws Except
                         sb.append("\n");
 			iLine++;
 		}
-                // make header
-                text[0] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
+        
+        // make header
+        text[0] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
                 
-                // content starts from here until meet Pathologist (beginning of a sentence)
-                sb = new StringBuilder();
-		if(iLine < allLines - 1) {
-			iLine++;
-//			text += lines[iLine].trim() + "\n";
-                        sb.append(lines[iLine].trim());
-                        sb.append("\n");
-		}
-		else {
-			iLine = -1;
-		}
-		iLine++;
+//                 // content starts from here until meet Pathologist (beginning of a sentence)
+//                 sb = new StringBuilder();
+// 		if(iLine < allLines - 1) {
+// 			iLine++;
+// //			text += lines[iLine].trim() + "\n";
+//                         sb.append(lines[iLine].trim());
+//                         sb.append("\n");
+// 		}
+// 		else {
+// 			iLine = -1;
+// 		}
+// 		iLine++;
 		
-		Pattern p = Pattern.compile("^\\s*Pathologist");
-		Matcher m = p.matcher(lines[iLine]);
-		while(iLine < allLines && !m.find()) {
-			if(!lines[iLine].trim().equals("")) {
-//				text += lines[iLine].trim() + "\n";
-                                sb.append(lines[iLine].trim());
-                                sb.append("\n");
-			}
-			iLine++;
-			m = p.matcher(lines[iLine]);
-		}
-		// skip until meet GROSS DESCRIPTION
-		p = Pattern.compile("^GROSS DESCRIPTION");
-		m = p.matcher(lines[iLine]);
-		while(iLine < allLines && !m.find()) {
-			iLine++;
-			m = p.matcher(lines[iLine]);
-		}
+// 		Pattern p = Pattern.compile("^\\s*Pathologist");
+// 		Matcher m = p.matcher(lines[iLine]);
+// 		while(iLine < allLines && !m.find()) {
+// 			if(!lines[iLine].trim().equals("")) {
+// //				text += lines[iLine].trim() + "\n";
+//                                 sb.append(lines[iLine].trim());
+//                                 sb.append("\n");
+// 			}
+// 			iLine++;
+// 			m = p.matcher(lines[iLine]);
+// 		}
+// 		// skip until meet GROSS DESCRIPTION
+// 		p = Pattern.compile("^GROSS DESCRIPTION");
+// 		m = p.matcher(lines[iLine]);
+// 		while(iLine < allLines && !m.find()) {
+// 			iLine++;
+// 			m = p.matcher(lines[iLine]);
+// 		}
+
 		// keep contain until meet E_O_R
 		while(iLine < allLines && lines[iLine].indexOf("E_O_R") == -1) {
 			if(!lines[iLine].trim().equals("")) {
@@ -310,22 +312,23 @@ public static String[] separatePathologyHeaderFooter(String input) throws Except
 			}
 			iLine++;
 		}
-                // make content
+        
+        // make content
 		// remove stop word
-                // and **ID-NUM
-                text[1] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
+        // and **ID-NUM
+        text[1] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
 		text[1] = text[1].replaceAll("\\*\\*ID\\-NUM", "");
 		text[1] = text[1].replaceAll("\\*\\*INITIALS", "");
 		text[1] = text[1].replaceAll("_{3,}", "");
                 
-                // footer starts from here to the end
-                sb = new StringBuilder();
-                while(iLine < allLines) {
-                    sb.append(lines[iLine++].trim());
-                    sb.append("\n");
-                }
-                // make the footer
-                text[2] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
+        // footer starts from here to the end
+        sb = new StringBuilder();
+        while(iLine < allLines) {
+            sb.append(lines[iLine++].trim());
+            sb.append("\n");
+        }
+        // make the footer
+        text[2] = TextUtil.removeDuplicatedSpace(sb.toString().trim());
                 
 		return text;
 	}
diff --git a/src/edu/pitt/cs/nih/backend/feedback/TextFileFeedbackManager_LibSVM_WordTree.java b/src/edu/pitt/cs/nih/backend/feedback/TextFileFeedbackManager_LibSVM_WordTree.java
index 399e1bd..b4452af 100644
--- a/src/edu/pitt/cs/nih/backend/feedback/TextFileFeedbackManager_LibSVM_WordTree.java
+++ b/src/edu/pitt/cs/nih/backend/feedback/TextFileFeedbackManager_LibSVM_WordTree.java
@@ -981,13 +981,13 @@ public String wordTreeSkippedNGramPatternString(Map<String, String> spanMap)
 		patternStr = sb.toString().trim().replaceAll(whiteSpaceBeforePunc, "\\\\s{0,1}");
 		// in case the first skipped n-gram is a punctuation
 		// there would be no white space before the n-gram
-		patternStr = patternStr.replaceAll(" (?=(\\(\\\\S\\+))", "\\\\s*");
+		patternStr = patternStr.replaceAll(" (?=(\\(\\\\S\\+))", "\\\\W*");
 //		// quote the string
 //		patternStr = TextUtil.escapeRegex(patternStr);
 		// reverse 's
 		patternStr = patternStr.replaceAll("'s", "' {0,1}s");
 		// replace whitespace by \s
-		patternStr = patternStr.replaceAll("\\s(?!\\{)", "\\\\s+");
+		patternStr = patternStr.replaceAll("\\s(?!\\{)", "\\\\W+");
 
 //		System.out.println("Search pattern: " + patternStr);