<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom" xml:lang="en">
	<id>https://lms.onnocenter.or.id/wiki/index.php?action=history&amp;feed=atom&amp;title=Python%3A_NLTK_cleaning_text</id>
	<title>Python: NLTK cleaning text - Revision history</title>
	<link rel="self" type="application/atom+xml" href="https://lms.onnocenter.or.id/wiki/index.php?action=history&amp;feed=atom&amp;title=Python%3A_NLTK_cleaning_text"/>
	<link rel="alternate" type="text/html" href="https://lms.onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;action=history"/>
	<updated>2026-04-20T01:53:22Z</updated>
	<subtitle>Revision history for this page on the wiki</subtitle>
	<generator>MediaWiki 1.45.1</generator>
	<entry>
		<id>https://lms.onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;diff=46924&amp;oldid=prev</id>
		<title>Onnowpurbo at 08:49, 5 February 2017</title>
		<link rel="alternate" type="text/html" href="https://lms.onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;diff=46924&amp;oldid=prev"/>
		<updated>2017-02-05T08:49:59Z</updated>

		<summary type="html">&lt;p&gt;&lt;/p&gt;
&lt;a href=&quot;https://lms.onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;amp;diff=46924&amp;amp;oldid=46793&quot;&gt;Show changes&lt;/a&gt;</summary>
		<author><name>Onnowpurbo</name></author>
	</entry>
	<entry>
		<id>https://lms.onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;diff=46793&amp;oldid=prev</id>
		<title>Onnowpurbo: Created page with &quot; Cleaning Text      01 May 2016 Python Data Wrangling   Create some raw text  # Create a list of three strings. incoming_reports = [&quot;We are attacking on their left flank but a...&quot;</title>
		<link rel="alternate" type="text/html" href="https://lms.onnocenter.or.id/wiki/index.php?title=Python:_NLTK_cleaning_text&amp;diff=46793&amp;oldid=prev"/>
		<updated>2017-01-29T08:40:27Z</updated>

		<summary type="html">&lt;p&gt;Created page with &amp;quot; Cleaning Text      01 May 2016 Python Data Wrangling   Create some raw text  # Create a list of three strings. incoming_reports = [&amp;quot;We are attacking on their left flank but a...&amp;quot;&lt;/p&gt;
&lt;p&gt;&lt;b&gt;New page&lt;/b&gt;&lt;/p&gt;&lt;div&gt;&lt;br /&gt;
Cleaning Text&lt;br /&gt;
&lt;br /&gt;
    01 May 2016 Python Data Wrangling &lt;br /&gt;
&lt;br /&gt;
Create some raw text&lt;br /&gt;
&lt;br /&gt;
# Create a list of three strings.&lt;br /&gt;
incoming_reports = [&amp;quot;We are attacking on their left flank but are losing many men.&amp;quot;, &lt;br /&gt;
               &amp;quot;We cannot see the enemy army. Nothing else to report.&amp;quot;, &lt;br /&gt;
               &amp;quot;We are ready to attack but are waiting for your orders.&amp;quot;]&lt;br /&gt;
&lt;br /&gt;
Separate by word&lt;br /&gt;
&lt;br /&gt;
# import word tokenizer&lt;br /&gt;
from nltk.tokenize import word_tokenize&lt;br /&gt;
&lt;br /&gt;
# Apply word_tokenize to each element of the list called incoming_reports&lt;br /&gt;
tokenized_reports = [word_tokenize(report) for report in incoming_reports]&lt;br /&gt;
&lt;br /&gt;
# View tokenized_reports&lt;br /&gt;
tokenized_reports&lt;br /&gt;
&lt;br /&gt;
[[&amp;#039;We&amp;#039;,&lt;br /&gt;
  &amp;#039;are&amp;#039;,&lt;br /&gt;
  &amp;#039;attacking&amp;#039;,&lt;br /&gt;
  &amp;#039;on&amp;#039;,&lt;br /&gt;
  &amp;#039;their&amp;#039;,&lt;br /&gt;
  &amp;#039;left&amp;#039;,&lt;br /&gt;
  &amp;#039;flank&amp;#039;,&lt;br /&gt;
  &amp;#039;but&amp;#039;,&lt;br /&gt;
  &amp;#039;are&amp;#039;,&lt;br /&gt;
  &amp;#039;losing&amp;#039;,&lt;br /&gt;
  &amp;#039;many&amp;#039;,&lt;br /&gt;
  &amp;#039;men&amp;#039;,&lt;br /&gt;
  &amp;#039;.&amp;#039;],&lt;br /&gt;
 [&amp;#039;We&amp;#039;,&lt;br /&gt;
  &amp;#039;can&amp;#039;,&lt;br /&gt;
  &amp;#039;not&amp;#039;,&lt;br /&gt;
  &amp;#039;see&amp;#039;,&lt;br /&gt;
  &amp;#039;the&amp;#039;,&lt;br /&gt;
  &amp;#039;enemy&amp;#039;,&lt;br /&gt;
  &amp;#039;army&amp;#039;,&lt;br /&gt;
  &amp;#039;.&amp;#039;,&lt;br /&gt;
  &amp;#039;Nothing&amp;#039;,&lt;br /&gt;
  &amp;#039;else&amp;#039;,&lt;br /&gt;
  &amp;#039;to&amp;#039;,&lt;br /&gt;
  &amp;#039;report&amp;#039;,&lt;br /&gt;
  &amp;#039;.&amp;#039;],&lt;br /&gt;
 [&amp;#039;We&amp;#039;,&lt;br /&gt;
  &amp;#039;are&amp;#039;,&lt;br /&gt;
  &amp;#039;ready&amp;#039;,&lt;br /&gt;
  &amp;#039;to&amp;#039;,&lt;br /&gt;
  &amp;#039;attack&amp;#039;,&lt;br /&gt;
  &amp;#039;but&amp;#039;,&lt;br /&gt;
  &amp;#039;are&amp;#039;,&lt;br /&gt;
  &amp;#039;waiting&amp;#039;,&lt;br /&gt;
  &amp;#039;for&amp;#039;,&lt;br /&gt;
  &amp;#039;your&amp;#039;,&lt;br /&gt;
  &amp;#039;orders&amp;#039;,&lt;br /&gt;
  &amp;#039;.&amp;#039;]]&lt;br /&gt;
&lt;br /&gt;
# Import regex&lt;br /&gt;
import re&lt;br /&gt;
&lt;br /&gt;
# Import string&lt;br /&gt;
import string&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
regex = re.compile(&amp;#039;[%s]&amp;#039; % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html&lt;br /&gt;
&lt;br /&gt;
tokenized_reports_no_punctuation = []&lt;br /&gt;
&lt;br /&gt;
for review in tokenized_reports:&lt;br /&gt;
&lt;br /&gt;
    new_review = []&lt;br /&gt;
    for token in review: &lt;br /&gt;
        new_token = regex.sub(u&amp;#039;&amp;#039;, token)&lt;br /&gt;
        if not new_token == u&amp;#039;&amp;#039;:&lt;br /&gt;
            new_review.append(new_token)&lt;br /&gt;
&lt;br /&gt;
    tokenized_reports_no_punctuation.append(new_review)&lt;br /&gt;
&lt;br /&gt;
tokenized_reports_no_punctuation&lt;br /&gt;
&lt;br /&gt;
[[&amp;#039;We&amp;#039;,&lt;br /&gt;
  &amp;#039;are&amp;#039;,&lt;br /&gt;
  &amp;#039;attacking&amp;#039;,&lt;br /&gt;
  &amp;#039;on&amp;#039;,&lt;br /&gt;
  &amp;#039;their&amp;#039;,&lt;br /&gt;
  &amp;#039;left&amp;#039;,&lt;br /&gt;
  &amp;#039;flank&amp;#039;,&lt;br /&gt;
  &amp;#039;but&amp;#039;,&lt;br /&gt;
  &amp;#039;are&amp;#039;,&lt;br /&gt;
  &amp;#039;losing&amp;#039;,&lt;br /&gt;
  &amp;#039;many&amp;#039;,&lt;br /&gt;
  &amp;#039;men&amp;#039;],&lt;br /&gt;
 [&amp;#039;We&amp;#039;,&lt;br /&gt;
  &amp;#039;can&amp;#039;,&lt;br /&gt;
  &amp;#039;not&amp;#039;,&lt;br /&gt;
  &amp;#039;see&amp;#039;,&lt;br /&gt;
  &amp;#039;the&amp;#039;,&lt;br /&gt;
  &amp;#039;enemy&amp;#039;,&lt;br /&gt;
  &amp;#039;army&amp;#039;,&lt;br /&gt;
  &amp;#039;Nothing&amp;#039;,&lt;br /&gt;
  &amp;#039;else&amp;#039;,&lt;br /&gt;
  &amp;#039;to&amp;#039;,&lt;br /&gt;
  &amp;#039;report&amp;#039;],&lt;br /&gt;
 [&amp;#039;We&amp;#039;,&lt;br /&gt;
  &amp;#039;are&amp;#039;,&lt;br /&gt;
  &amp;#039;ready&amp;#039;,&lt;br /&gt;
  &amp;#039;to&amp;#039;,&lt;br /&gt;
  &amp;#039;attack&amp;#039;,&lt;br /&gt;
  &amp;#039;but&amp;#039;,&lt;br /&gt;
  &amp;#039;are&amp;#039;,&lt;br /&gt;
  &amp;#039;waiting&amp;#039;,&lt;br /&gt;
  &amp;#039;for&amp;#039;,&lt;br /&gt;
  &amp;#039;your&amp;#039;,&lt;br /&gt;
  &amp;#039;orders&amp;#039;]]&lt;br /&gt;
&lt;br /&gt;
Remove filler words&lt;br /&gt;
&lt;br /&gt;
from nltk.corpus import stopwords&lt;br /&gt;
&lt;br /&gt;
tokenized_reports_no_stopwords = []&lt;br /&gt;
for report in tokenized_reports_no_punctuation:&lt;br /&gt;
    new_term_vector = []&lt;br /&gt;
    for word in report:&lt;br /&gt;
        if not word in stopwords.words(&amp;#039;english&amp;#039;):&lt;br /&gt;
            new_term_vector.append(word)&lt;br /&gt;
    tokenized_reports_no_stopwords.append(new_term_vector)&lt;br /&gt;
&lt;br /&gt;
tokenized_reports_no_stopwords&lt;br /&gt;
&lt;br /&gt;
[[&amp;#039;We&amp;#039;, &amp;#039;attacking&amp;#039;, &amp;#039;left&amp;#039;, &amp;#039;flank&amp;#039;, &amp;#039;losing&amp;#039;, &amp;#039;many&amp;#039;, &amp;#039;men&amp;#039;],&lt;br /&gt;
 [&amp;#039;We&amp;#039;, &amp;#039;see&amp;#039;, &amp;#039;enemy&amp;#039;, &amp;#039;army&amp;#039;, &amp;#039;Nothing&amp;#039;, &amp;#039;else&amp;#039;, &amp;#039;report&amp;#039;],&lt;br /&gt;
 [&amp;#039;We&amp;#039;, &amp;#039;ready&amp;#039;, &amp;#039;attack&amp;#039;, &amp;#039;waiting&amp;#039;, &amp;#039;orders&amp;#039;]]&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
&lt;br /&gt;
==Referensi==&lt;br /&gt;
&lt;br /&gt;
* http://chrisalbon.com/python/cleaning_text.html&lt;/div&gt;</summary>
		<author><name>Onnowpurbo</name></author>
	</entry>
</feed>