2013-05-10 2 views
0

재귀를 사용하여 텍스트 정의 경계 마커를 통해 텍스트를 분할하고 원본 텍스트 파일의 모든 구성 부분을 포함하는 목록 및 문자열 목록을 만들려고합니다.재귀 텍스트 분할에 문제가 발생했습니다.

분할이 발생하지 않습니다. 진짜 문제 스크립트 : 여기

짧은 버전

def separate(text,boundary = None): 
    if boundary == None: 
     m = re.findall(r'(?<=boundary=).*',text) 
     i = 0 
     while i < len(m): #have all levels of Boundary/headers named 
      boundary = m[i] 
      textList = recursiveSplit(text,boundary) 
      i += 1 
     pdb.set_trace() 
     return textList 

def recursiveSplit(chunk,boundary): 
    if type(chunk) is types.StringType: 
     ar = re.split(r'(?P<boundary>)(?!--)',chunk) 
     return ar 
    if type(chunk) is types.ListType: 
     i = 0 
     while i < len(chunk): 
      chunk[i] = recursiveSplit(chunk[i],boundary) 
      i += 1 
     return obj 

내가 전에이 스크립트를 게시 한 사람들은 내가 할거야 전체에 게시 나를 싶었

#Textbasics email parser 
#based on a "show original" file converted into text 

from sys import argv 
import re, os, pdb, types 

script, filename = argv 
text = open(filename).read() 
type = "text only" #Set the default type of email 

#cut the email up by sections 
#--A section is defined as any time there are two line breaks in a row 
textList = re.split(r"\n\n", text) 
header = textList[0] 
if re.search(r'MIME-Version',header): 
    type = "MIME" 

# If mail has no attachments, parse as a text-only email 
class Parser(object): 

    def __init__(self,textList): 
     a = 1 
     self.body = "" 
     self.textList = textList 
     self.header = textList[0] 
     while a < len(textList): 
      self.body = self.body + textList[a] + '\n\n' 
      a += 1 

     m = re.search(r'(?<=Subject:).*', self.header) 
     self.subject = m.group(0) 

     m = re.search(r'(?<=From:).*', self.header) 
     self.fromVar = m.group(0) 

     m = re.search(r'(?<=To:).*', self.header) 
     self.toVar = m.group(0) 

     m = re.search(r'(?<=Date:)\w+\s\w+\s\w+', self.header) 
     self.date = m.group(0) 

    def returnParsed(self,descriptor = "all"): 
     if descriptor == "all": 
      retv = "Subject: " + self.subject + "\n" + "From: " + self.fromVar + "\n" + "To: " + self.toVar + "\n" + "Date: " + self.date + "\n" + "\n" + self.body 
      return retv 

     if descriptor == "subject": 
      return self.subject 
     if descriptor == "fromVar": 
      return self.fromVar 
     if descriptor == "toVar": 
      return self.toVar 
     if descriptor == "date": 
      return self.date 
     if descriptor == "body": 
      return self.body 

class MIMEParser(Parser): 

    class MIMEDataDecoder(object): 
     def __init__(self,decodeString,type): 
      pass  


    def __init__(self,textList): 
     self.textList = textList 
     self.nestedItems = [] 
     newItem = NestedItem(self) 
     newItem.setContentType("Header") 
     newItem.setValue(self.textList[0]) 
     self.nestedItems.append(newItem) 
     if re.search(r'(boundary=)',newItem.value): 
      helperItem = NestedItem(self) 
      helperItem.value = (self.textList[0]) 
      m = re.search(r'(?<=Content-Type:).+(?=;)',newItem.value) 
      helperItem.setContentType(m.group(0)) 
      self.nestedItems.append(helperItem) 

     self.organizeData() 
     """i = 0 
     while i < len(self.textList): 
      newItem = NestedItem(self) 
      ct = self.nextContentType 
      newItem.setContentType(ct) 
      newItem.setValue(self.textList[i]) 
      self.nestedItems.append(newItem) 
      m = re.search(r'(?<=Content-Type:).+(?=;)',self.textList[i]) 
      if m: 
       self.nextContentType = m.group(0) 
      i += 1 
      """ 

    def nestItem (self,item): 
     self.nestedItems.append(item) 

    def organizeData(self): 
     self.nestLevel = 1 
     self.currentSuper = self 
     m = re.search(r'(?<=boundary=).*',self.textList[0]) 
     self.currentBoundary = m.group(0) 
     self.currentList = self.textList 
     self.currentList.remove(self.textList[0]) 
     self.formerObjectDatabase = {} 
     pdb.set_trace() 
     while self.nestLevel > 0: 
      i = 0 
      while i < len(self.currentList): 

       boundary = self.currentBoundary 
       #If block is a "normal block", containing a current boundary identifier 
       p = re.search(r'--(?P<boundary>)(?!--)', text) 
       if p: 
        newItem = NestedItem(self.currentSuper) 
        newItem.setValue(self.currentList[i]) 
        r = re.search(r'(?<=Content-Type:).+(?=;)',newItem.value) 
        if r: 
         newItem.setContentType(r.group(0)) 
        self.currentObject = newItem 
        self.currentSuper.nestItem(self.currentObject) 
       #If the block contains a new block boundary 
       m = re.search(r'(?<=boundary=).*',self.currentList[i]) 
       if m: 
        #begin new layer of recursive commands 
        newFormerObject = self.FormerCurrentObject(self.currentList,self.currentSuper,self.currentBoundary) 
        self.formerObjectDatabase[self.nestLevel] = newFormerObject 
        self.currentSuper = self.currentObject 
        self.nestLevel += 1 
        self.currentBoundary = m.group(0) 
        boundary = self.currentBoundary 
        #self.currentList = re.split(r'--(?P<boundary>)(?!--)', self.currentList[i]) 
       boundary = self.currentBoundary 
       #If block contains an "end of boundary" marker 
       q = re.search(r'(?P<boundary>)--', text) 
       if q: 
        self.nestLevel -= 1 
        currentObject = self.formerObjectDatabase[self.nestLevel] 
        self.currentList = currentObject.formerList 
        self.currentSuper = currentObject.formerSuper 
        self.currentBoundary = currentObject.formerBoundary 
       i += 1      


    class FormerCurrentObject: 
     def __init__(self,formerList,formerSuper,formerBoundary): 
      self.formerList = formerList 
      self.formerSuper = formerSuper 
      self.formerBoundary = formerBoundary 




    def printAll(self): 
     print "printing all: %d" % len(self.nestedItems) 
     i = 0 
     while i < len(self.nestedItems): 
      print "printing out item %d" % i 
      self.nestedItems[i].printOut() 
      i += 1 

class NestedItem(object): 
    def __init__(self,superObject,contentType=" ",value = " "): 
     self.superObject = superObject 
     self.contentType = contentType 
     self.value = value 
     self.nestedItems = [] 

    def nestItem(self,item): 
     self.nestedItems.append(item) 

    def printOut(self,printBuffer = ""): 
     print printBuffer + '++%s' % self.contentType 
     print printBuffer + self.value 
     a = 0 
     printBuffer = printBuffer + " " 
     while a < len(self.nestedItems): 
      self.nestedItems[a].printOut(printBuffer) 

    def setContentType(self,contentType): 
     self.contentType = contentType 

    def setValue(self,value): 
     self.value = value 



if type == "text only": 
    p = Parser(textList) 
    print p.returnParsed() 
# ---PROBLEM CODE STARTS HERE--- 
def separate(text,boundary = None): 
    pdb.set_trace() 
    if boundary == None: 
     m = re.findall(r'(?<=boundary=).*',text) 
     i = 0 
     textList = [text] 
     while i < len(m): #have all levels of Boundary/headers named 
      boundary = m[i] 
      textList = recursiveSplit(textList,boundary) 
      i += 1 

    return textList 

def recursiveSplit(chunk,boundary): 
    if type(chunk) is types.ListType: #<<--error occurs here 
     for obj in chunk: 
      recursiveSplit(obj,boundary) 
    if type(chunk) is types.StringType: 
     list = re.split(r'(?P<boundary>)(?!--)',chunk) 
     return list 
    return None 
#---PROBLEM CODE ENDS(?) HERE--- 

if type == "MIME": 
    #separate the text file instead by its boundary identifier 
    p = MIMEParser(separate(text)) 
    p.printAll() 

모든 MIME 유형 전자 메일을 사용할 수 있습니다. 여기에 내가 편리하게 사용하고있는 것이있다

MIME-Version: 1.0 
Received: by 10.112.170.40 with HTTP; Fri, 3 May 2013 05:08:21 -0700 (PDT) 
Date: Fri, 3 May 2013 08:08:21 -0400 
Delivered-To: [email protected] 
Message-ID: <@mail.gmail.com> 
Subject: MiB 5/3/13 7:43AM (EST) 
From: ME<[email protected]> 
To: SOMEONE <[email protected]> 
Content-Type: multipart/mixed; boundary=BNDRY1 

--BNDRY1 
Content-Type: multipart/alternative; boundary=BNDRY2 

--BNDRY2 
Content-Type: text/plain; charset=ISO-8859-1 

-changed signature methods to conform more to working clinic header 
methods(please test/not testable in simulator) 
-confirmed that signature image is showing up in simulator. Awaiting 
further tests 
-Modified findings spacing/buffer. See if you like it 

--BNDRY2 
Content-Type: text/html; charset=ISO-8859-1 

<div dir="ltr">-changed signature methods to conform more to working clinic header methods(please test/not testable in simulator)<div style>-confirmed that signature image is showing up in simulator. Awaiting further tests</div> 
<div style>-Modified findings spacing/buffer. See if you like it</div></div> 

--BNDRY2-- 
--BNDRY1 
Content-Type: application/zip; name="Make it Brief.ipa.zip" 
Content-Disposition: attachment; filename="Make it Brief.ipa.zip" 
Content-Transfer-Encoding: base64 
X-Attachment-Id: f_hg9biuno0 

<<FILE DATA>> 
--BNDRY1-- 
+0

최종 출력물은 무엇입니까? –

+0

BOUND 표시로 구분 된 목록과 문자열 목록을 얻길 바래요. 그래서 각 목록 요소는 BNDRY (X)로 시작하고 BNDRY라는 제목 아래에있는 정보 만 포함합니다 – Pinwheeler

+0

정규식이 올바르지 않다는 것을 알았습니다. BNDRY1을 하드 코드하면 예상대로 작동합니다 (BNDRY1). – Pinwheeler

답변

2

문제는 정규식에있다. 좀 더 시원한 방법이있을 수 있지만, 방금 변수를 기반으로 검색 문자열 리터럴을 만들었습니다.

def recursiveSplit(chunk,boundary): 
    if type(chunk) is types.StringType: 
     #ar = re.split(r'(?P<boundary>)(?!--)',chunk) 
     searchString = "--%s" % boundary 
     print searchString 
     ar = re.split(searchString,chunk) 
     return ar 
    if type(chunk) is types.ListType: 
     i = 0 
     while i < len(chunk): 
      chunk[i] = recursiveSplit(chunk[i],boundary) 
      i += 1 
     return obj 
+0

+1 자급 자족. – jpaugh

관련 문제