Skip to content

Commit

Permalink
Merge pull request #2 from srdjanmarjanovic/impromevent/parsers-and-c…
Browse files Browse the repository at this point in the history
…leaners-improved

Improve parse and add test email sources
  • Loading branch information
ilijastuden committed Dec 16, 2015
2 parents 5ac0f0b + 20cb774 commit 1ec1220
Show file tree
Hide file tree
Showing 31 changed files with 7,125 additions and 127 deletions.
203 changes: 101 additions & 102 deletions composer.lock

Large diffs are not rendered by default.

15 changes: 9 additions & 6 deletions src/ActiveCollab/EmailReplyExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ final class EmailReplyExtractor
const GENERIC = 'Generic';
const GOOGLE_MAIL = 'GoogleMail';
const ANDROID_MAIL = 'AndroidMail';
const HOTMAIL = 'Hotmail';
const HUSHMAIL = 'Hushmail';
const IOS = 'iOS';
const OUTLOOK = 'Outlook';
const OUTLOOK_EXPRESS = 'OutlookExpress';
const YAHOO = 'Yahoo';
const APPLE_CLOUD_MAIL = 'AppleCloudMail';
const MAIL_RU_MAIL = 'MailRuMail';

/**
* Parse input file and return reply
Expand All @@ -32,7 +33,7 @@ public static function extractReplyEml($path)

$extractor = self::getExtractorEml(self::detectMailer(self::getHeadersRelevantForMailerDetectionEml($parser)), $parser);

return (string) $extractor->body;
return trim($extractor->body);
}

/**
Expand All @@ -47,15 +48,15 @@ public static function extractReply($headers, $body)
$mailer = self::detectMailer(self::getHeadersRelevantForMailerDetection($headers));
$extractor = self::getExtractor($mailer, $body);

return [$extractor->body,$mailer];
return [trim($extractor->body),$mailer];
}

/**
* @param string $mailer
* @param Parser $parser
* @return Extractor
*/
private function getExtractorEml($mailer, Parser &$parser)
private static function getExtractorEml($mailer, Parser &$parser)
{
$class_name = "ActiveCollab\\EmailReplyExtractor\\Extractor\\{$mailer}Extractor";

Expand Down Expand Up @@ -87,6 +88,8 @@ public static function detectMailer(array $headers)
return self::IOS;
} else if (strpos($headers['x-mailer'], 'Microsoft Office Outlook') !== false || strpos($headers['x-mailer'], 'Microsoft Outlook 14.') !== false || strpos($headers['x-mailer'], 'Microsoft Windows Live Mail') !== false) {
return self::OUTLOOK;
} else if (strpos($headers['x-mailer'], 'Outlook Express') !== false) {
return self::OUTLOOK_EXPRESS;
} else if (strpos($headers['x-mailer'], 'YahooMail') !== false) {
return self::YAHOO;
} else if (strpos($headers['x-mailer'], 'Apple Mail') !== false) {
Expand All @@ -106,9 +109,9 @@ public static function detectMailer(array $headers)
return self::APPLE_CLOUD_MAIL;
} else if (strpos($headers['message-id'], '@email.android.com') !== false) {
return self::ANDROID_MAIL;
} else if (strpos($headers['message-id'], 'i.mail.ru') !== false) {
return self::MAIL_RU_MAIL;
}
} else if (isset($headers['received']) && strpos($headers['received'], 'hotmail.com') !== false) {
return self::HOTMAIL;
} else if (isset($headers['mime-version']) && strpos($headers['mime-version'], 'Apple Message framework') !== false) {
return self::APPLE_MAIL;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,37 @@
<?php
namespace ActiveCollab\EmailReplyExtractor\Extractor;


/**
 * Reply extractor used for messages detected as coming from the Android mail client.
 *
 * @package ActiveCollab\EmailReplyExtractor\Extractor
 */
final class AndroidMailExtractor extends Extractor
{
    /**
     * Extract Reply from Android mail
     *
     * Runs the parent processing first, then removes a trailing
     * "<name> wrote:" attribution and a trailing default "Sent from ..."
     * signature, and finally delegates to stripSignature().
     */
    protected function processLines()
    {
        parent::processLines();

        // strip 'first name last name wrote:'
        $this->cutTrailingLinesMatching('/(.*?)wrote:/is');

        // strip default signature
        $this->cutTrailingLinesMatching('/^sent from(.*?)/is');

        $this->stripSignature();
    }

    /**
     * Truncate the body when the text returned by getLinesFromEnd(1) matches a pattern.
     *
     * getLinesFromEnd(1) is consumed here as a pair: an array of line strings
     * and the index at which the body should be cut (its exact semantics are
     * defined in the parent class - confirm there).
     *
     * @param string $pattern Complete PCRE pattern, delimiters and modifiers included
     */
    private function cutTrailingLinesMatching($pattern)
    {
        list ($lines, $cut_line) = $this->getLinesFromEnd(1);

        // An empty-string separator replaces the former null one, which is
        // deprecated for internal functions as of PHP 8.1.
        if (preg_match($pattern, implode('', $lines))) {
            // Keep only the lines that precede the cut point.
            $this->body = array_slice($this->body, 0, $cut_line);
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,16 @@ protected function processLines()
$this->stripSignature();
$this->convertPlainTextQuotesToBlockquotes();
}

/**
* Return original message splitters
*
* @return array
*/
protected function getOriginalMessageSplitters()
{
return array_merge(parent::getOriginalMessageSplitters(), [
'/\-------------------------/is',
]);
}
}
18 changes: 13 additions & 5 deletions src/ActiveCollab/EmailReplyExtractor/Extractor/Extractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -66,20 +66,22 @@ protected function processLines()
*/
public function joinLines()
{
    // Collapse the list of lines into one string and drop surrounding whitespace.
    $joined = trim(implode("\n", $this->body));

    // Remove a leading UTF-8 byte order mark as a whole three-byte sequence.
    // Passing the BOM bytes to trim() as a character list would strip those
    // bytes individually and could clip multibyte characters at either end.
    $bom = "\xEF\xBB\xBF";

    while (strncmp($joined, $bom, 3) === 0) {
        // Whitespace that followed the BOM is leading whitespace once it is gone.
        $joined = ltrim((string) substr($joined, 3));
    }

    $this->body = $joined;
}

/**
* Return original message splitters
*
* @todo
* @return array
*/
protected function getOriginalMessageSplitters()
{
return [
//'On Thursday, October 15, 2015 12:50 PM, owner (Active Collab) wrote:',
'/On(.*?)wrote\:(.*?)/is',
'/^Am(.*?)schrieb(.*?)/is',
// '----- Forwarded Message -----',
'- Reply above this line to leave a comment -',
'-- REPLY ABOVE THIS LINE --',
'-- REPLY ABOVE THIS LINE',
Expand Down Expand Up @@ -306,6 +308,7 @@ protected function &getParser()
static function toPlainText($html) {
$plain = (string) $html;


// strip slashes
$plain = (string) trim(stripslashes($plain));

Expand Down Expand Up @@ -364,10 +367,13 @@ static function toPlainText($html) {
// elements that convert to single newline
$plain = (string) preg_replace(array('/<br[^>]*>/i', '/(<tr[^>]*>|<\/tr>)/i'), "\n", $plain); // <br> <tr>

// div elements
$plain = (string) preg_replace('/<div[^>]*>(.*?)<\/div>/i', "\\1\n", $plain); // <div>with content</div>

// images
$plain = (string) preg_replace(array('/<img\s+[^>]*src="([^"]*)"[^>]*>/i'), "[Image: \\1]", $plain); // <br> <tr>

// <hr> converts to -----------------------
// <hr> converts to --------------------//---
$plain = (string) preg_replace('/<hr[^>]*>/i', "\n-------------------------\n", $plain); // <hr>

// other table tags
Expand All @@ -392,7 +398,7 @@ static function toPlainText($html) {
} else {
return $text;
}
}, $plain); // <li />
}, $plain); // <a href="$url">$text</a>

// handle blockquotes
$plain = (string) preg_replace_callback('/<blockquote[^>]*>(.*?)<\/blockquote>/is', function ($blockquote_content) {
Expand All @@ -408,10 +414,12 @@ static function toPlainText($html) {
return "\n\n" . implode("\n", $return) . "\n\n";
}, $plain);

$plain = (string) preg_replace('/<title[^>]*>(.*?)<\/title>/i', "", $plain); // remove unnecessary title tag

// strip other tags
$plain = (string) strip_tags($plain);

// clean up unneccessary newlines
// clean up unnecessary newlines
$plain = (string) preg_replace("/\n\s+\n/", "\n\n", $plain);
$plain = (string) preg_replace("/[\n]{3,}/", "\n\n", $plain);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,15 @@
*/
final class GenericExtractor extends Extractor
{
    /**
     * List the patterns that mark where the original (quoted) message begins.
     *
     * The inherited splitters come first, followed by a pattern matching a
     * run of dashes.
     *
     * @return array
     */
    protected function getOriginalMessageSplitters()
    {
        $inherited = parent::getOriginalMessageSplitters();
        $dashed_rule = ['/\-------------------------/is'];

        return array_merge($inherited, $dashed_rule);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php
namespace ActiveCollab\EmailReplyExtractor\Extractor;

/**
* Class MailRuMailExtractor
*
* Extractor for the 'MailRuMail' mailer. It declares no members of its own,
* so all behavior comes from the Extractor base class.
*
* @package ActiveCollab\EmailReplyExtractor\Extractor
*/
final class MailRuMailExtractor extends Extractor
{
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?php
namespace ActiveCollab\EmailReplyExtractor\Extractor;

/**
 * Extractor for the 'OutlookExpress' mailer.
 *
 * @package ActiveCollab\EmailReplyExtractor\Extractor
 */
final class OutlookExpressExtractor extends Extractor
{
    /**
     * List the patterns that mark where the original (quoted) message begins.
     *
     * The inherited splitters come first, followed by a pattern matching a
     * run of dashes.
     *
     * @return array
     */
    protected function getOriginalMessageSplitters()
    {
        $inherited = parent::getOriginalMessageSplitters();
        $dashed_rule = ['/\-------------------------/is'];

        return array_merge($inherited, $dashed_rule);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,37 @@
*/
final class OutlookExtractor extends Extractor
{
public function processLines()
{
    // Run the shared line processing first, then remove the signature.
    parent::processLines();

    // stripSignature() is an instance method, so it is called on $this
    // rather than with static-call syntax.
    $this->stripSignature();
}

/**
* @param string $html
*
* @return string
* Overrides Extractor::stripSignature()
*/
static function toPlainText($html)
public function stripSignature()
{
$html = str_replace('div', 'p', $html);
for ($x = 0, $lines_count = count($this->body); $x < $lines_count; $x++) {
$line = trim($this->body[(($lines_count - $x) - 1)]);

if ($line && trim($line)) {
if ($line == "-- " || $line == "--" || substr($line, 0, strlen('-- ')) == '-- ') {
$this->body = array_splice($this->body, 0, (($lines_count - $x) - 1));
return;
}

return parent::toPlainText($html);
// Should signature be longer than 8 lines?
if ($x > 8) {
return;
}
}
}
}

/**
* Return original message splitters
*
* @todo
* @return array
*/
protected function getOriginalMessageSplitters()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ final class YahooExtractor extends Extractor
protected function getOriginalMessageSplitters()
{
return array_merge(parent::getOriginalMessageSplitters(), [
'/On(.*?)wrote\:(.*?)/is'
'/\-------------------------/is',
]);
}

Expand Down
17 changes: 17 additions & 0 deletions src/ActiveCollab/EmailReplyExtractor/Extractor/iOSExtractor.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,21 @@
*/
final class iOSExtractor extends Extractor
{
    /**
     * Extract reply from iOS mail.
     *
     * Runs the parent processing first, then removes a trailing default
     * "Sent from ..." signature, and finally delegates to stripSignature().
     */
    protected function processLines()
    {
        parent::processLines();

        // strip default signature
        // getLinesFromEnd(1) is consumed as a pair: an array of line strings
        // and the index at which the body should be cut (semantics are
        // defined in the parent class - confirm there).
        list ($default_signature, $cut_line) = $this->getLinesFromEnd(1);

        // An empty-string separator replaces the former null one, which is
        // deprecated for internal functions as of PHP 8.1.
        if (preg_match('/^sent from(.*?)/is', implode('', $default_signature))) {
            // Keep only the lines that precede the cut point.
            $this->body = array_slice($this->body, 0, $cut_line);
        }

        $this->stripSignature();
    }
}
1 change: 0 additions & 1 deletion test/example_messages/.gitignore
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
*.eml
Loading

0 comments on commit 1ec1220

Please sign in to comment.