Remove Unwanted Mentions Programmatically

@Mentions

# python
import re
def remove_at_mentions(line):
    """Return *line* with every ``@handle`` mention deleted.

    A mention is an ``@`` immediately followed by one or more word
    characters (``\\w``).  Only the mention itself is removed; whitespace
    around it is left in place.
    """
    # Raw string: in a plain literal "\w" is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r"@\w+", "", line)
def scrub_text(text):
    """Return *text* with @mentions removed and outer whitespace trimmed."""
    without_mentions = remove_at_mentions(text)
    return without_mentions.strip()

# Example: the leading @mention is removed and the remaining text is stripped.
scrubbed_text = scrub_text("@dogwalker I hope you had a nice long walk!")

URLs

# python
# Regex from https://gist.github.com/dperini/729294
import re
def remove_urls(line):
    """Return *line* with every URL deleted.

    A URL here is an optional ``http``/``https``/``ftp`` scheme, a mandatory
    ``//``, optional ``user:password@`` credentials, a public IPv4 address or
    a dotted host name, and an optional port and path/query/fragment.
    Matching is case-insensitive.  Only the URL itself is removed, so the
    whitespace that surrounded it stays in the result.
    """
    # Raw strings keep the regex escapes (\/, \S, \d, \.) intact instead of
    # relying on Python passing invalid string escapes through unchanged; the
    # re module itself resolves the \uXXXX escapes.
    URL_REGEX = (
        # optional scheme, then the mandatory "//"
        r"(?:(?:(?:https?|ftp):)?\/\/)"
        # optional user:password@
        r"(?:\S+(?::\S*)?@)?"
        r"(?:"
        # IPv4 host: reject 10.x.x.x and 127.x.x.x ...
        r"(?!(?:10|127)(?:\.\d{1,3}){3})"
        # ... 169.254.x.x and 192.168.x.x ...
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
        # ... and 172.16.x.x through 172.31.x.x
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
        # first octet 1-223, two octets 0-255, last octet 1-254
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
        r"|"
        # or a host name: one or more dot-terminated labels ...
        r"(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+"
        # ... followed by a TLD of at least two letters (trailing dot allowed)
        r"(?:[a-z\u00a1-\uffff]{2,}\.?)"
        r")"
        # optional port
        r"(?::\d{2,5})?"
        # optional path, query string or fragment
        r"(?:[/?#]\S*)?"
    )
    # The character classes only list lowercase letters, so IGNORECASE is
    # required for URLs such as "HTTPS://Example.COM" to be recognised.
    return re.sub(URL_REGEX, "", line, flags=re.IGNORECASE)

# Example call; the return value (the sentence with the URL removed) is not stored.
remove_urls("have you visited https://www.receptiviti.com/company to find out more")

Email headers and Email Metadata

# python
from email import message_from_string
def extract_body_from_email(text):
    """Return the plain-text body of a raw email message.

    The message is parsed with ``email.message_from_string``.  For a
    multipart message, the payload of the first ``text/plain`` part that is
    not an attachment is returned; for a single-part message, the message's
    own payload is returned.

    Args:
        text: The complete raw message (headers and body) as a string.

    Returns:
        The payload as ``bytes`` with any Content-Transfer-Encoding undone
        (``get_payload(decode=True)``), or ``None`` when a multipart message
        contains no non-attachment ``text/plain`` part.
    """
    msg = message_from_string(text)
    if not msg.is_multipart():
        return msg.get_payload(decode=True)

    for part in msg.walk():
        if part.get_content_type() != 'text/plain':
            continue
        # get_content_disposition() yields the lowercased disposition value
        # (or None when the header is absent), so "Attachment" is caught and
        # an inline part whose filename merely contains "attachment" is not.
        if part.get_content_disposition() == 'attachment':
            continue
        # Stop at the first match so a later text/plain part cannot
        # overwrite the real body.
        return part.get_payload(decode=True)
    return None

    text =  """Delivered-To: [email protected]
    Received: by 2002:a05:6000:1188:0:0:0:0 with SMTP id g8csp2511771wrx;
            Mon, 14 Feb 2024 05:11:19 -0700 (PDT)
    MIME-Version: 1.0
    In-Reply-To: <[email protected]>
    From: Test User <[email protected]>
    Date: on, 14 Feb 2024 08:11:06 -0400
    Subject: Fwd: Spam email from you
    To: All full-time employees <[email protected]>
    Content-Type: multipart/related; boundary="000000000000754a2e05af44eeac"
    --000000000000754a2e05af44eeac
    Content-Type: multipart/alternative; boundary="000000000000754a2c05af44eeab"

    --000000000000754a2c05af44eeab
    Content-Type: text/plain; charset="UTF-8"

    As part of our ongoing security awareness, if you see emails like this,
    please mark them as phishing.
    Note the warning signs - the actual email address doesn't match sender, the
    signature isn't right, and there's a giant red warning banner :)

    [image: Screen Shot 2020-09-14 at 9.24.41 am.png]

    Marking something as phishing is slightly different than using the spam
    button in gmail.
    The email goes through different processes / to a different team and helps
    Google prevent these from landing in our inboxes.

    Thanks all!
    *Test User* Desig, Nation | Head, Software @ Receptiviti |
    [email protected]

    This message is intended only for the use of the intended recipients, and
    it may be privileged and confidential. If you are not the intended
    recipient, you are hereby notified that any review, re-transmission,
    conversion to hard copy, copying, circulation or other use of this message
    is strictly prohibited and may be illegal. If you are not the intended
    recipient, please notify me immediately by return email and delete this
    message from your system. Thank you.

    --000000000000754a2c05af44eeab
    Content-Type: text/html; charset="UTF-8"
    Content-Transfer-Encoding: quoted-printable

    <div dir=3D"ltr"><div>As part of our ongoing security awareness, if you see
     emails like this, please mark them as phishing. <br></div><div>Note the wa
    rning signs - the actual email address doesn&#39;t match sender, the signat
    ure isn&#39;t right, and there&#39;s a giant red warning banner :)</div><div>
    <br></div><div><img alt=3D"Screen Shot 2020-09-14 at 9.24.41 am.png" src
    =3D"cid:1748c828c6b7ef93e481" width=3D"542" height=3D"309"></div><div><font face=3D"
    Times"><span style=3D"font-size:12px">This message is intended only for the use of the
    intended recipients, and it may be privileged and confidential. If you are not the intended
    recipient, you are hereby notified that any review, re-transmission,
    conversion to hard copy, copying, circulation or other use of this message
    is strictly prohibited and may be illegal. If you are not the intended
    recipient, please notify me immediately by return email and delete this
    message from your system. Thank you.</span></font></div></div>

    --000000000000754a2c05af44eeab--
    --000000000000754a2e05af44eeac
    Content-Type: image/png; name="Screen Shot 2020-09-14 at 9.24.41 am.png"
    Content-Disposition: inline; filename="Screen Shot 2020-09-14 at 9.24.41 am.png"
    Content-Transfer-Encoding: base64
    Content-ID: <1748c828c6b7ef93e481>
    X-Attachment-Id: 1748c828c6b7ef93e481

    --000000000000754a2e05af44eeac--"""

# Example call on the sample message; the return value is not stored.
extract_body_from_email(text)

HTML

# python
# pip3 install bs4
from bs4 import BeautifulSoup
def strip_html_basic(message_string, parser="lxml-xml"):
    """Return the visible text of *message_string* with markup removed.

    The markup is parsed with BeautifulSoup using *parser*, every ``<style>``
    element is dropped together with its contents, and the remaining text
    fragments are stripped of surrounding whitespace and joined with
    newlines.

    NOTE(review): the default ``"lxml-xml"`` appears to select lxml's XML
    parser, which would need the ``lxml`` package in addition to ``bs4`` -
    confirm against the Beautiful Soup documentation.
    """
    document = BeautifulSoup(message_string, parser)
    for style_tag in document.find_all("style"):
        style_tag.decompose()
    return document.get_text("\n", strip=True)

# Sample HTML fragment for the demo; note that <body> and <html> are never closed.
html_doc = """
<html><head><title>The Receptiviti Story</title></head>
<body>
<p class="story">Every word counts...
<a href="http://receptiviti.com/i" class="link">I</a>,
<a href="http://receptiviti.com/fountain" class="link">Fountain</a> and
<a href="http://receptiviti.com/puppy" class="link">Puppy</a>;
When language first emerged, they were not made equally.</p>
<p class="story">...</p>
"""
# Example call using the default parser; the return value is not stored.
strip_html_basic(html_doc)

@Mentions

URLs

Email headers and Email Metadata

HTML

@Mentions

URLs

Email headers and Email Metadata

HTML