-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathsentitext.php
115 lines (96 loc) · 3.43 KB
/
sentitext.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
<?php
/*
Identify sentiment-relevant string-level properties of input text.
*/
// Punctuation sequences that SentiText::_words_and_emoticons() looks for
// directly before or after a word when normalising tokens.
const PUNC_LIST = [
    // single marks
    ".", "!", "?", ",", ";", ":", "-", "'", "\"",
    // repeated marks
    "!!", "!!!", "??", "???",
    // mixed question/exclamation runs
    "?!?", "!?!", "?!?!", "!?!?",
];
/**
 * Tokenises a piece of text and records string-level properties of it:
 * the list of whitespace-separated tokens (with punctuation from PUNC_LIST
 * stripped off the edges of words) and whether only some tokens are ALL CAPS.
 */
class SentiText {
    /** @var string The input text, as stored by the constructor. */
    private $text = "";

    /** @var array|null List of tokens produced by _words_and_emoticons(). */
    public $words_and_emoticons = null;

    /** @var bool|null True when some, but not all, tokens are ALL CAPS. */
    public $is_cap_diff = null;

    /**
     * @param mixed $text The text to analyse. Scalars and null are cast to
     *                    string; any other value is stored unchanged.
     */
    public function __construct($text){
        // Normalise ints, floats, bools and null so the regex helpers
        // below always receive a string.
        if (!is_string($text) && (is_scalar($text) || $text === null)) {
            $text = (string) $text;
        }
        $this->text = $text;

        // Tokens keep emoticons and contractions intact; only punctuation
        // directly adjacent to a word is removed.
        $this->words_and_emoticons = $this->_words_and_emoticons();
        $this->is_cap_diff = $this->allcap_differential($this->words_and_emoticons);
    }

    /**
     * Remove all punctuation from a string.
     *
     * @param string $string Text to clean.
     * @return string|null `$string` with every run of [[:punct:]] characters
     *                     removed (whatever preg_replace() returns).
     */
    public function strip_punctuation($string) {
        return preg_replace("/[[:punct:]]+/", "", $string);
    }

    /**
     * Count how many elements of `$haystack` are identical to `$needle`.
     *
     * Uses strict comparison so that numeric-looking strings such as
     * "0.5" and ".50" are not treated as the same value.
     *
     * @param array $haystack Values to search.
     * @param mixed $needle   Value to count.
     * @return int Number of strictly-equal occurrences (0 when absent).
     */
    public function array_count_values_of($haystack, $needle) {
        return count(array_keys($haystack, $needle, true));
    }

    /**
     * Check whether just some words in the input are ALL CAPS.
     *
     * @param array $words The words to inspect.
     * @return bool True if some but not all items in `$words` are ALL CAPS.
     */
    private function allcap_differential($words){
        $allcap_words = 0;
        foreach ($words as $word) {
            // ctype_upper() depends on the current locale (see the PHP manual)
            // and is only true when every character is an uppercase letter.
            if (ctype_upper($word)) {
                $allcap_words += 1;
            }
        }
        // "Different" means at least one ALL-CAPS word and at least one that is not.
        return $allcap_words > 0 && $allcap_words < count($words);
    }

    /**
     * Split the punctuation-free text into words.
     *
     * Removing punctuation loses emoticons and contractions. Empty items and
     * single-character "words" such as 'a' and 'I' are dropped.
     *
     * @return array List (0-indexed) of words at least two bytes long.
     */
    public function _words_only(){
        $text_mod = $this->strip_punctuation($this->text);
        $words_only = preg_split('/\s+/', $text_mod, -1, PREG_SPLIT_NO_EMPTY) ?: [];
        // strlen() counts bytes, so this drops empty and one-byte items.
        $words_only = array_filter($words_only, function($word){ return strlen($word) > 1; });
        // array_filter() preserves keys; reindex so callers get a plain list.
        return array_values($words_only);
    }

    /**
     * Split the text on whitespace and strip PUNC_LIST punctuation that is
     * attached directly before or after a known word.
     *
     * Tokens that do not consist of exactly one word plus one PUNC_LIST entry
     * (for example emoticons such as ":)") are left untouched.
     *
     * @return array List (0-indexed) of tokens at least two bytes long.
     */
    public function _words_and_emoticons(){
        $wes = preg_split('/\s+/', $this->text) ?: [];
        // Get rid of residual empty items or single-character tokens, then
        // reindex because array_filter() preserves the original keys.
        $wes = array_values(array_filter($wes, function($word){ return strlen($word) > 1; }));

        // Map every "punct + word" and "word + punct" spelling to the bare word.
        // A bare word never contains punctuation, so a token matches at most one
        // entry and a replaced token can never match again.
        $replacements = [];
        foreach ($this->_words_only() as $word) {
            foreach (PUNC_LIST as $punct) {
                $replacements[$punct . $word] = $word;
                $replacements[$word . $punct] = $word;
            }
        }

        foreach ($wes as $i => $token) {
            if (isset($replacements[$token])) {
                $wes[$i] = $replacements[$token];
            }
        }
        return $wes;
    }
}
?>