Skip to main content

Remove Unwanted Mentions Programmatically

@Mentions

# python
import re
def remove_at_mentions(line):
    """Return *line* with every @mention removed.

    A mention is an ``@`` immediately followed by one or more word
    characters (letters, digits or underscore). Only the mention itself is
    deleted; whitespace around it is left in place.

    Args:
        line: The text to clean.

    Returns:
        A copy of *line* without the matched mentions.
    """
    # Raw string: in a plain literal the backslash-w escape is invalid and
    # triggers a warning on recent Python versions.
    return re.sub(r"@\w+", "", line)
def scrub_text(text):
    """Remove @mentions from *text* and trim leading/trailing whitespace."""
    without_mentions = remove_at_mentions(text)
    return without_mentions.strip()

# Example: the leading "@dogwalker" mention is removed and the surrounding
# whitespace trimmed, leaving "I hope you had a nice long walk!".
scrubbed_text = scrub_text("@dogwalker I hope you had a nice long walk!")

URLs

# python
# Regex from https://gist.github.com/dperini/729294
import re
# URL pattern adapted from https://gist.github.com/dperini/729294, without
# the start/end anchors so it can find URLs embedded in running text.
# Compiled once at import time. Raw strings keep the regex escapes intact,
# and IGNORECASE mirrors the case-insensitive flag of the upstream pattern so
# that upper-case schemes and hosts are matched too.
_URL_PATTERN = re.compile(
    r"(?:(?:(?:https?|ftp):)?\/\/)"  # optional scheme, then "//"
    r"(?:\S+(?::\S*)?@)?"  # optional user:password@
    r"(?:"
    # IPv4 host, excluding private and local address ranges
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # dotted host name labels followed by a top-level domain
    r"(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+"
    r"(?:[a-z\u00a1-\uffff]{2,}\.?)"
    r")"
    r"(?::\d{2,5})?"  # optional port
    r"(?:[/?#]\S*)?",  # optional path, query string or fragment
    re.IGNORECASE,
)


def remove_urls(line):
    """Return *line* with every URL removed.

    Matches http, https and ftp URLs as well as protocol-relative ones
    (starting with two slashes), regardless of letter case. Only the URL is
    deleted; whitespace on either side of it is left in place.

    Args:
        line: The text to clean.

    Returns:
        A copy of *line* without the matched URLs.
    """
    return _URL_PATTERN.sub("", line)

# Example: the URL is cut out of the sentence, while the spaces on either
# side of it are kept. The returned string is not stored.
remove_urls("have you visited https://www.receptiviti.com/company to find out more")

Email headers and Email Metadata

# python
from email import message_from_string
def extract_body_from_email(text):
    """Extract the plain-text body from a raw email message.

    For a multipart message, the payload of the first ``text/plain`` part
    that is not marked as an attachment is returned. For a non-multipart
    message, the payload of the message itself is returned.

    Args:
        text: The complete raw message (headers and body) as a string.

    Returns:
        The payload decoded according to its Content-Transfer-Encoding
        (``get_payload(decode=True)`` yields bytes), or ``None`` when a
        multipart message has no suitable ``text/plain`` part.
    """
    msg = message_from_string(text)
    if not msg.is_multipart():
        return msg.get_payload(decode=True)

    # walk() visits the container parts as well as the leaves; only a
    # text/plain leaf that is not an attachment qualifies as the body.
    for part in msg.walk():
        if part.get_content_type() != 'text/plain':
            continue
        # str() turns a missing header (None) into a harmless "None".
        content_disposition = str(part.get('Content-Disposition'))
        if 'attachment' in content_disposition:
            continue
        # Stop at the first match so later text/plain parts cannot
        # overwrite the real body.
        return part.get_payload(decode=True)

    # No plain-text part at all (e.g. an HTML-only message).
    return None

# Sample raw message: a multipart/related container holding a
# multipart/alternative part (text/plain + text/html) and an inline image
# part. The second line of the folded "Received:" header must start with
# whitespace; without it the email parser ends the header block there and
# never sees the multipart Content-Type.
text = """Delivered-To: [email protected]
Received: by 2002:a05:6000:1188:0:0:0:0 with SMTP id g8csp2511771wrx;
        Mon, 14 Feb 2024 05:11:19 -0700 (PDT)
MIME-Version: 1.0
In-Reply-To: <[email protected]>
From: Test User <[email protected]>
Date: on, 14 Feb 2024 08:11:06 -0400
Subject: Fwd: Spam email from you
To: All full-time employees <[email protected]>
Content-Type: multipart/related; boundary="000000000000754a2e05af44eeac"
--000000000000754a2e05af44eeac
Content-Type: multipart/alternative; boundary="000000000000754a2c05af44eeab"

--000000000000754a2c05af44eeab
Content-Type: text/plain; charset="UTF-8"

As part of our ongoing security awareness, if you see emails like this,
please mark them as phishing.
Note the warning signs - the actual email address doesn't match sender, the
signature isn't right, and there's a giant red warning banner :)

[image: Screen Shot 2020-09-14 at 9.24.41 am.png]

Marking something as phishing is slightly different than using the spam
button in gmail.
The email goes through different processes / to a different team and helps
Google prevent these from landing in our inboxes.

Thanks all!
*Test User* Desig, Nation | Head, Software @ Receptiviti |
[email protected]

This message is intended only for the use of the intended recipients, and
it may be privileged and confidential. If you are not the intended
recipient, you are hereby notified that any review, re-transmission,
conversion to hard copy, copying, circulation or other use of this message
is strictly prohibited and may be illegal. If you are not the intended
recipient, please notify me immediately by return email and delete this
message from your system. Thank you.

--000000000000754a2c05af44eeab
Content-Type: text/html; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable

<div dir=3D"ltr"><div>As part of our ongoing security awareness, if you see
emails like this, please mark them as phishing. <br></div><div>Note the wa
rning signs - the actual email address doesn&#39;t match sender, the signat
ure isn&#39;t right, and there&#39;s a giant red warning banner :)</div><div>
<br></div><div><img alt=3D"Screen Shot 2020-09-14 at 9.24.41 am.png" src
=3D"cid:1748c828c6b7ef93e481" width=3D"542" height=3D"309"></div><div><font face=3D"
Times"><span style=3D"font-size:12px">This message is intended only for the use of the
intended recipients, and it may be privileged and confidential. If you are not the intended
recipient, you are hereby notified that any review, re-transmission,
conversion to hard copy, copying, circulation or other use of this message
is strictly prohibited and may be illegal. If you are not the intended
recipient, please notify me immediately by return email and delete this
message from your system. Thank you.</span></font></div></div>

--000000000000754a2c05af44eeab--
--000000000000754a2e05af44eeac
Content-Type: image/png; name="Screen Shot 2020-09-14 at 9.24.41 am.png"
Content-Disposition: inline; filename="Screen Shot 2020-09-14 at 9.24.41 am.png"
Content-Transfer-Encoding: base64
Content-ID: <1748c828c6b7ef93e481>
X-Attachment-Id: 1748c828c6b7ef93e481

--000000000000754a2e05af44eeac--"""

# Example call; the extracted body is not stored.
extract_body_from_email(text)

HTML

# python
# pip3 install bs4
from bs4 import BeautifulSoup
def strip_html_basic(message_string, parser="lxml-xml"):
    """Return the visible text of an HTML/XML string.

    The markup is parsed with Beautiful Soup, ``<style>`` and ``<script>``
    elements are dropped (their contents are not readable text), and the
    remaining text fragments are stripped and joined with newlines.

    Args:
        message_string: The markup to convert.
        parser: Name of the Beautiful Soup parser to use. If that parser is
            not installed, the built-in "html.parser" is used instead.

    Returns:
        The extracted plain text.
    """
    # Local import keeps this block free of any new file-level dependency.
    from bs4 import FeatureNotFound

    try:
        soup = BeautifulSoup(message_string, parser)
    except FeatureNotFound:
        # The default "lxml-xml" needs the separate lxml package, which
        # installing bs4 alone does not provide; fall back to the parser
        # that ships with Python rather than failing.
        soup = BeautifulSoup(message_string, "html.parser")

    for tag in soup(["style", "script"]):
        tag.decompose()
    return soup.get_text("\n", strip=True)

# Sample HTML document used to demonstrate strip_html_basic. Note that the
# <body> and <html> elements are never closed in this snippet.
html_doc = """
<html><head><title>The Receptiviti Story</title></head>
<body>
<p class="story">Every word counts...
<a href="http://receptiviti.com/i" class="link">I</a>,
<a href="http://receptiviti.com/fountain" class="link">Fountain</a> and
<a href="http://receptiviti.com/puppy" class="link">Puppy</a>;
When language first emerged, they were not made equally.</p>
<p class="story">...</p>
"""
# Example call using the default parser argument; the returned text is not
# stored.
strip_html_basic(html_doc)