# Reading a huge .txt file at once causes memory problems, so split it into
# chunks before building the vocabulary.
import os
import sys
import konlpy
import pandas as pd
import numpy as np
os.environ['JAVA_OPTS'] = 'Xmx4096M'
import itertools
import mr #local module
# --- Split the large input file into ~1 MB chunk files under ./input/ ---
file_name = "test_export_mentions_2020-11-17_title.txt"
#file_name = "test_export_mentions_2020-11-17_title_utf8.txt" #test
file_out = "outputfile"

lines_tot = mr.file_len(file_name)            # total line count (local helper)
filesize = mr.getfilesize(file_name) * 1000   # size as reported by mr helper — units per mr, TODO confirm
print("파일명 : ", file_name)
print("줄 개수 : ", lines_tot)
print("파일사이즈 : ", filesize)

numbits = 1000000  # readlines() size hint per chunk, in bytes (~1 MB)
# +1 inside round() guards against a zero chunk count; the outer +1 is a
# safety margin so the final partial chunk is always read.
loop_num = round(os.stat(file_name).st_size/numbits+1)+1
print(os.stat(file_name).st_size/numbits+1)
print(loop_num)

# f.readlines(hint) returns whole lines totalling roughly `hint` bytes.
# Each returned line already ends with '\n', so write it unchanged —
# the previous version appended an extra '\n', doubling every line break
# in the chunk files.
with open(file_name, 'r', encoding='utf-8') as src:
    for i in range(loop_num):
        with open('./input/'+file_out+str(i)+'.txt', 'w', encoding='utf-8') as out:
            out.writelines(src.readlines(numbits))
import itertools
def f_append(text):
    """Add *text* to ./replace_word.txt if it is not already listed.

    Returns 'Exist' when the word is already present, 'Yes' after
    appending it as a new line. (Raises FileNotFoundError if the word
    file does not exist, same as the original behavior.)
    """
    # Load the current words; strip()+split keeps this consistent with
    # f_list()/f_del() so a trailing newline never yields an empty entry.
    with open('./replace_word.txt', 'r', encoding='utf-8') as src:
        existing = src.read().strip().split('\n')

    if text in existing:
        return 'Exist'

    # Append the new word on its own line.
    with open('./replace_word.txt', 'a', encoding='utf-8') as dst:
        dst.write(text)
        dst.write('\n')
    return 'Yes'
def f_list():
    """Return the words stored in ./replace_word.txt, one list entry per line."""
    with open('./replace_word.txt', 'r', encoding='utf-8') as src:
        contents = src.read()
    return contents.strip().split('\n')
def f_del(text):
    """Remove *text* from ./replace_word.txt.

    Returns 'Y' when the word was found and deleted, 'N' otherwise.
    The file is rewritten only when a deletion actually happened;
    leftover debug prints from the original were removed.
    """
    with open('./replace_word.txt', 'r', encoding='utf-8') as src:
        words = src.read().strip().split('\n')

    # Keep every word except the one being deleted (all occurrences).
    kept = [word for word in words if word != text]
    if len(kept) == len(words):
        return 'N'  # nothing matched — leave the file untouched

    with open('./replace_word.txt', 'w', encoding='utf-8') as dst:
        dst.writelines(word + '\n' for word in kept)
    return 'Y'