From b8945764ef8126eb781aa08257367b2c7db9d0d2 Mon Sep 17 00:00:00 2001 From: gtrivedi Date: Thu, 23 Jul 2015 15:37:31 -0400 Subject: [PATCH] Fix header separation in Pathology --- .../nih/backend/featureVector/Preprocess.java | 89 ++++++++++--------- ...xtFileFeedbackManager_LibSVM_WordTree.java | 4 +- 2 files changed, 48 insertions(+), 45 deletions(-) diff --git a/src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java b/src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java index 11e7365..69e10bb 100755 --- a/src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java +++ b/src/edu/pitt/cs/nih/backend/featureVector/Preprocess.java @@ -267,40 +267,42 @@ public static String[] separatePathologyHeaderFooter(String input) throws Except sb.append("\n"); iLine++; } - // make header - text[0] = TextUtil.removeDuplicatedSpace(sb.toString().trim()); + + // make header + text[0] = TextUtil.removeDuplicatedSpace(sb.toString().trim()); - // content starts from here until meet Pathologist (beginning of a sentence) - sb = new StringBuilder(); - if(iLine < allLines - 1) { - iLine++; -// text += lines[iLine].trim() + "\n"; - sb.append(lines[iLine].trim()); - sb.append("\n"); - } - else { - iLine = -1; - } - iLine++; +// // content starts from here until meet Pathologist (beginning of a sentence) +// sb = new StringBuilder(); +// if(iLine < allLines - 1) { +// iLine++; +// // text += lines[iLine].trim() + "\n"; +// sb.append(lines[iLine].trim()); +// sb.append("\n"); +// } +// else { +// iLine = -1; +// } +// iLine++; - Pattern p = Pattern.compile("^\\s*Pathologist"); - Matcher m = p.matcher(lines[iLine]); - while(iLine < allLines && !m.find()) { - if(!lines[iLine].trim().equals("")) { -// text += lines[iLine].trim() + "\n"; - sb.append(lines[iLine].trim()); - sb.append("\n"); - } - iLine++; - m = p.matcher(lines[iLine]); - } - // skip until meet GROSS DESCRIPTION - p = Pattern.compile("^GROSS DESCRIPTION"); - m = p.matcher(lines[iLine]); - while(iLine < allLines && !m.find())
{ - iLine++; - m = p.matcher(lines[iLine]); - } +// Pattern p = Pattern.compile("^\\s*Pathologist"); +// Matcher m = p.matcher(lines[iLine]); +// while(iLine < allLines && !m.find()) { +// if(!lines[iLine].trim().equals("")) { +// // text += lines[iLine].trim() + "\n"; +// sb.append(lines[iLine].trim()); +// sb.append("\n"); +// } +// iLine++; +// m = p.matcher(lines[iLine]); +// } +// // skip until meet GROSS DESCRIPTION +// p = Pattern.compile("^GROSS DESCRIPTION"); +// m = p.matcher(lines[iLine]); +// while(iLine < allLines && !m.find()) { +// iLine++; +// m = p.matcher(lines[iLine]); +// } + // keep contain until meet E_O_R while(iLine < allLines && lines[iLine].indexOf("E_O_R") == -1) { if(!lines[iLine].trim().equals("")) { @@ -310,22 +312,23 @@ public static String[] separatePathologyHeaderFooter(String input) throws Except } iLine++; } - // make content + + // make content // remove stop word - // and **ID-NUM - text[1] = TextUtil.removeDuplicatedSpace(sb.toString().trim()); + // and **ID-NUM + text[1] = TextUtil.removeDuplicatedSpace(sb.toString().trim()); text[1] = text[1].replaceAll("\\*\\*ID\\-NUM", ""); text[1] = text[1].replaceAll("\\*\\*INITIALS", ""); text[1] = text[1].replaceAll("_{3,}", ""); - // footer starts from here to the end - sb = new StringBuilder(); - while(iLine < allLines) { - sb.append(lines[iLine++].trim()); - sb.append("\n"); - } - // make the footer - text[2] = TextUtil.removeDuplicatedSpace(sb.toString().trim()); + // footer starts from here to the end + sb = new StringBuilder(); + while(iLine < allLines) { + sb.append(lines[iLine++].trim()); + sb.append("\n"); + } + // make the footer + text[2] = TextUtil.removeDuplicatedSpace(sb.toString().trim()); return text; } diff --git a/src/edu/pitt/cs/nih/backend/feedback/TextFileFeedbackManager_LibSVM_WordTree.java b/src/edu/pitt/cs/nih/backend/feedback/TextFileFeedbackManager_LibSVM_WordTree.java index 399e1bd..b4452af 100644 --- 
a/src/edu/pitt/cs/nih/backend/feedback/TextFileFeedbackManager_LibSVM_WordTree.java +++ b/src/edu/pitt/cs/nih/backend/feedback/TextFileFeedbackManager_LibSVM_WordTree.java @@ -981,13 +981,13 @@ public String wordTreeSkippedNGramPatternString(Map spanMap) patternStr = sb.toString().trim().replaceAll(whiteSpaceBeforePunc, "\\\\s{0,1}"); // in case the first skipped n-gram is a punctuation // there would be no white space before the n-gram - patternStr = patternStr.replaceAll(" (?=(\\(\\\\S\\+))", "\\\\s*"); + patternStr = patternStr.replaceAll(" (?=(\\(\\\\S\\+))", "\\\\W*"); // // quote the string // patternStr = TextUtil.escapeRegex(patternStr); // reverse 's patternStr = patternStr.replaceAll("'s", "' {0,1}s"); // replace whitespace by \s - patternStr = patternStr.replaceAll("\\s(?!\\{)", "\\\\s+"); + patternStr = patternStr.replaceAll("\\s(?!\\{)", "\\\\W+"); // System.out.println("Search pattern: " + patternStr);