-
Notifications
You must be signed in to change notification settings - Fork 87
PHP Rapid Automatic Keyword Extraction
yooper edited this page Sep 6, 2016
·
2 revisions
PHP Rapid Automatic Keyword Extraction PHP Rapid Automatic Keyword Extraction PHP Rapid Automatic Keyword Extraction
use TextAnalysis\Analysis\Keywords\Rake;
use TextAnalysis\Documents\TokensDocument;
use TextAnalysis\Tokenizers\WhitespaceTokenizer;
use StopWordFactory;
use TextAnalysis\Filters;
class GetKeywords
{
const NGRAM_SIZE = 3;
/**
* @var \TextAnalysis\Interfaces\ITokenTransformation[]
*/
protected $tokenFilters = [];
/**
* @var \TextAnalysis\Interfaces\ITokenTransformation[]
*/
protected $contentFilters = [];
public function get($content)
{
foreach($this->getContentFilters() as $contentFilter)
{
$content = $contentFilter->transform($content);
}
return $this->getKeywordScores($content);
}
/**
*
* @return \TextAnalysis\Interfaces\ITokenTransformation[]
*/
public function getContentFilters()
{
if(empty($this->contentFilters)) {
$lambdaFunc = function($word){
return preg_replace('/[\x00-\x1F\x80-\xFF]/u', ' ', $word);
};
$this->contentFilters = [
new Filters\StripTagsFilter(),
new Filters\LowerCaseFilter(),
new Filters\NumbersFilter(),
new Filters\EmailFilter(),
new Filters\UrlFilter(),
new Filters\PossessiveNounFilter(),
new Filters\QuotesFilter(),
new Filters\PunctuationFilter(),
new Filters\CharFilter(),
new Filters\LambdaFilter($lambdaFunc),
new Filters\WhitespaceFilter()
];
}
return $this->contentFilters;
}
/**
*
* @return \TextAnalysis\Interfaces\ITokenTransformation[]
*/
public function getTokenFilters()
{
if(empty($this->tokenFilters)) {
$stopwords = StopWordFactory::get('stop-words-fox.txt');
$this->tokenFilters = [
new Filters\StopWordsFilter($stopwords),
];
}
return $this->tokenFilters;
}
/**
*
* @param string $content
* @return array
*/
public function getKeywordScores($content)
{
$tokens = (new WhitespaceTokenizer())->tokenize($content);
$tokenDoc = new TokensDocument(array_map('strval', $tokens));
unset($tokens);
foreach($this->getTokenFilters() as $filter)
{
$tokenDoc->applyTransformation($filter, false);
}
$size = count($tokenDoc->toArray());
if($size < self::NGRAM_SIZE) {
return [];
}
$rake = new Rake($tokenDoc, self::NGRAM_SIZE);
return $rake->getKeywordScores();
}
}