Python: NLTK stopwords: Difference between revisions
From OnnoCenterWiki
Jump to navigationJump to search
Onnowpurbo (talk | contribs) Created page with "Directory ~/nltk_data/corpora/stopwords" |
Onnowpurbo (talk | contribs) |
||
| (6 intermediate revisions by the same user not shown) | |||
| Line 2: | Line 2: | ||
~/nltk_data/corpora/stopwords | ~/nltk_data/corpora/stopwords | ||
Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik | |||
import os,nltk,os.path,re,string | |||
import argparse | |||
from nltk.stem.porter import PorterStemmer | |||
ps=PorterStemmer() | |||
def parse_args(): | |||
parser = argparse.ArgumentParser() | |||
parser.add_argument('-i', '--infile', default='', help='input filename') | |||
return parser.parse_args() | |||
args = parse_args() | |||
infile = args.infile | |||
filename = open(infile,'r') | |||
fcontent=filename.read() | |||
filename.close() | |||
fs = fcontent.split() | |||
wordlist=[] | |||
for word in fs: | |||
word = ps.stem(word.strip(string.punctuation).lower()) | |||
if word not in nltk.corpus.stopwords.words('english') and len(word)<15: | |||
if word not in nltk.corpus.stopwords.words('indonesia') and len(word)<15: | |||
if word not in wordlist: | |||
wordlist.append(word) | |||
print( word ) | |||
else: | |||
pass | |||
else: | |||
pass | |||
Masukan kata2 yang tidak ingin ada dalam text ke dalam file | |||
~/nltk_data/corpora/stopwords/indonesia | |||
contoh | |||
saya | |||
punya | |||
sendiri | |||
kami | |||
kamu | |||
anda | |||
dia | |||
mereka | |||
jika | |||
yang | |||
itu | |||
siapa | |||
dengan | |||
a | |||
b | |||
c | |||
d | |||
e | |||
f | |||
.. | |||
.. | |||
1 | |||
2 | |||
3 | |||
4 | |||
5 | |||
.. | |||
.. | |||
01/1/2017 | |||
02/1/2017 | |||
03/1/2017 | |||
04/1/2017 | |||
05/1/2017 | |||
.. | |||
.. | |||
00:00 | |||
00:01 | |||
00:02 | |||
00:03 | |||
00:04 | |||
==Jika sudah ada stopword== | |||
misalnya, | |||
rm ~/nltk_data/corpora/stopwords/indonesia | |||
touch ~/nltk_data/corpora/stopwords/indonesia | |||
cat indonesia-id1 >> ~/nltk_data/corpora/stopwords/indonesia | |||
cat indonesia-angka >> ~/nltk_data/corpora/stopwords/indonesia | |||
cat indonesia-jam >> ~/nltk_data/corpora/stopwords/indonesia | |||
cat indonesia-politik >> ~/nltk_data/corpora/stopwords/indonesia | |||
cat indonesia-common >> ~/nltk_data/corpora/stopwords/indonesia | |||
cat indonesia-1common >> ~/nltk_data/corpora/stopwords/indonesia | |||
cat indonesia-tambahan >> ~/nltk_data/corpora/stopwords/indonesia | |||
Latest revision as of 22:25, 4 February 2017
Directory
~/nltk_data/corpora/stopwords
Script untuk scan apakah stopwords yang kita inginkan bekerja dengan baik
import os,nltk,os.path,re,string
import argparse
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer; the scan loop further down calls ps.stem() on every token.
ps=PorterStemmer()
def parse_args(argv=None):
    """Parse the command-line options of the stopword scan script.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse falls back to the process arguments (sys.argv[1:]).

    Returns:
        argparse.Namespace whose ``infile`` attribute holds the input
        filename, or '' when -i/--infile was not supplied.
    """
    parser = argparse.ArgumentParser()
    # Empty-string default keeps args.infile a str even when -i is omitted.
    parser.add_argument('-i', '--infile', default='', help='input filename')
    return parser.parse_args(argv)
args = parse_args()
infile = args.infile

# Read the whole input file; the context manager closes it even on error.
with open(infile, 'r') as source:
    fcontent = source.read()
fs = fcontent.split()

# Load both stopword lists once, instead of re-reading the corpus files for
# every token, and keep them in a set for constant-time membership tests.
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update(nltk.corpus.stopwords.words('indonesia'))

# wordlist keeps the surviving words in first-seen order; seen mirrors it as
# a set so the duplicate check does not rescan the list.
wordlist = []
seen = set()
for word in fs:
    # Normalise: trim surrounding punctuation, lower-case, then stem.
    # NOTE(review): the stopword test runs on the *stemmed* token, so stopword
    # entries only match when they equal the stemmed form - confirm intended.
    word = ps.stem(word.strip(string.punctuation).lower())
    # Skip over-long tokens, stopwords and words that were already printed.
    if len(word) >= 15 or word in stop_words or word in seen:
        continue
    seen.add(word)
    wordlist.append(word)
    print(word)
Masukkan kata-kata yang tidak ingin ada dalam teks ke dalam file
~/nltk_data/corpora/stopwords/indonesia
contoh
saya
punya
sendiri
kami
kamu
anda
dia
mereka
jika
yang
itu
siapa
dengan
a
b
c
d
e
f
..
..
1
2
3
4
5
..
..
01/1/2017
02/1/2017
03/1/2017
04/1/2017
05/1/2017
..
..
00:00
00:01
00:02
00:03
00:04
Jika sudah ada stopword
misalnya,
# Rebuild the Indonesian stopword file from the individual word lists.
# Start from an empty file, then append each list in turn.
rm ~/nltk_data/corpora/stopwords/indonesia
touch ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-id1 >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-angka >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-jam >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-politik >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-common >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-1common >> ~/nltk_data/corpora/stopwords/indonesia
cat indonesia-tambahan >> ~/nltk_data/corpora/stopwords/indonesia