PyCURL이 헤더보다 본문을 먼저 처리합니다.

this question에 대한 답변에서 영감을 받아, requests와 비슷한 인터페이스로 PyCurl을 래핑하려고 합니다. 다른 부분은 모두 괜찮은데, 헤더에서 본문 인코딩을 읽는 방법을 설명하는 PyCURL 문서를 따르면 다음과 같은 문제가 발생합니다. 헤더 콜백은 모든 응답 헤더에 대해 호출되지만, 이터레이터가 응답 줄을 내보내기 시작한 뒤에야 호출되므로 인코딩/문자셋 감지가 무의미해집니다.

코드는 다음과 같습니다:

import re 
import io 
import urllib 
import urllib.error 
import http 

import pycurl 


class CurlHTTPStream(object): 
    """Line-oriented streaming HTTP client built on a PycURL multi handle.

    A single ``pycurl.Curl`` easy handle is attached to a ``pycurl.CurlMulti``
    so that the transfer can be advanced step by step and the body consumed
    as decoded text lines via ``iter_lines()``.

    NOTE(review): this is the listing that reproduces the problem. The
    charset lookup in ``_split_lines_from_chunks`` runs before the first
    chunk is requested, so it always sees an empty ``self.headers``.
    """

    # Timeout (seconds) handed to CurlMulti.select() between perform() rounds.
    SELECT_TIMEOUT = 10 
    # Charset used to decode header lines, and the body when none is detected.
    HTTP_STANDARD_ENCODING = 'iso-8859-1' 

    def __init__(self, method, url, data=None, params=None, headers=None): 
     """Configure the curl handles; no network I/O is started here.

     Args:
      method: HTTP method name, passed to curl as CUSTOMREQUEST.
      url: Target URL; stored unmodified on ``self.url``.
      data: Accepted for interface compatibility but not used in this
       method.
      params: Optional mapping appended to the URL as ``key=value`` pairs
       joined with ``&`` (values are not percent-encoded).
      headers: Optional mapping sent as ``Name: value`` request headers.
     """
     self.url = url 
     # Body bytes written by curl accumulate here until drained.
     self.received_buffer = io.BytesIO() 

     self.curl = pycurl.Curl() 
     self.curl.setopt(pycurl.CUSTOMREQUEST, method) 
     if headers: 
      self.curl.setopt(
       pycurl.HTTPHEADER, 
       [ 
        '{}: {}'.format(key, value) 
        for key, value in headers.items() 
       ] 
      ) 
     if params: 
      query_string = '&'.join((
       '{}={}'.format(key, value) 
       for key, value in params.items() 
      )) 
      # Only the local ``url`` is extended; ``self.url`` keeps the bare URL.
      url = '{}?{}'.format(url, query_string) 
     self.curl.setopt(pycurl.URL, url) 
     # presumably asks the server for gzip and lets curl decompress -- confirm
     self.curl.setopt(pycurl.ENCODING, 'gzip') 
     self.curl.setopt(pycurl.CONNECTTIMEOUT, 5) 
     # Header lines go to header_function; body bytes go straight to the buffer.
     self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function) 
     self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write) 

     self.curl_multi = pycurl.CurlMulti() 
     self.curl_multi.add_handle(self.curl) 

     # 0 means "status not known yet"; see _check_status_code.
     self.status_code = 0 
     # Response headers keyed by lower-cased name, filled by header_function.
     self.headers = {} 

    def _any_data_received(self): 
     """Return True if the body buffer's write position is past zero."""
     return self.received_buffer.tell() != 0 

    def _get_received_data(self): 
     """Return all buffered body bytes and reset the buffer to empty."""
     result = self.received_buffer.getvalue() 
     self.received_buffer.truncate(0) 
     # truncate() does not move the position, so rewind explicitly.
     self.received_buffer.seek(0) 
     return result 

    def _check_status_code(self): 
     """Fetch the status code once and raise unless it is 200.

     Raises:
      urllib.error.HTTPError: if a status code is known and it is not
       ``http.HTTPStatus.OK``.
     """
     if self.status_code == 0: 
      self.status_code = self.curl.getinfo(pycurl.HTTP_CODE) 
     # A code of 0 is treated as "not available yet" and is not an error.
     if self.status_code != 0 and self.status_code != http.HTTPStatus.OK: 
      raise urllib.error.HTTPError(
       self.url, self.status_code, None, None, None 
      ) 

    def _perform_on_curl(self): 
     """Call perform() until it stops asking to be called again.

     Returns:
      The handle count reported by the last ``perform()`` call.
     """
     while True: 
      ret, num_handles = self.curl_multi.perform() 
      if ret != pycurl.E_CALL_MULTI_PERFORM: 
       break 
     return num_handles 

    def _iter_chunks(self): 
     """Yield raw body chunks (bytes) as the transfer progresses.

     This generator is the only place that drives the transfer: each
     iteration calls ``_perform_on_curl``, yields whatever body bytes were
     buffered, and waits in ``select`` while handles remain.
     """
     while True: 
      remaining = self._perform_on_curl() 
      if self._any_data_received(): 
       self._check_status_code() 
       yield self._get_received_data() 
      if remaining == 0: 
       break 
      self.curl_multi.select(self.SELECT_TIMEOUT) 

     # Final checks after the loop, covering responses that had no body.
     self._check_status_code() 
     self._check_curl_errors() 

    def _check_curl_errors(self): 
     """Raise ``pycurl.error`` for the first entry in info_read()'s third item."""
     for f in self.curl_multi.info_read()[2]: 
      # f[0] is skipped; the remaining fields become the exception args.
      raise pycurl.error(*f[1:]) 

    def iter_lines(self): 
     """Return a generator of decoded response lines."""
     chunks = self._iter_chunks() 
     return self._split_lines_from_chunks(chunks) 

    def _split_lines_from_chunks(self, chunks): 
     """Re-split an iterable of byte chunks into decoded text lines.

     Args:
      chunks: Iterable of ``bytes`` objects (here: ``_iter_chunks()``).

     Yields:
      ``str`` lines without line terminators, decoded with the charset
      chosen at the top of this generator.
     """
     print('foo') 
     print(self.headers) 
     # NOTE(review): everything down to the ``for`` loop executes on the
     # first next() call, before any chunk has been pulled from ``chunks``.
     # ``_iter_chunks`` has therefore not called perform() yet, so
     # ``self.headers`` is still the empty dict set in ``__init__`` and the
     # fallback branch below is always taken.
     charset = None 
     if 'content-type' in self.headers: 
      content_type = self.headers['content-type'].lower() 
      match = re.search('charset=(\S+)', content_type) 
      if match: 
       charset = match.group(1) 
       print('Decoding using %s' % charset) 
     if charset is None: 
      charset = self.HTTP_STANDARD_ENCODING 
      print('Assuming encoding is %s' % charset) 
     # Trailing partial line carried over to the next chunk.
     pending = None 
     for chunk in chunks: 
      if pending is not None: 
       chunk = pending + chunk 
      lines = chunk.splitlines() 
      # If the last split line ends with the chunk's last byte, the chunk did
      # not end on a line terminator, so that line is incomplete: hold it back.
      if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]: 
       pending = lines.pop() 
      else: 
       pending = None 
      for line in lines: 
       yield line.decode(charset) 
     # Flush a final line that had no terminator.
     if pending is not None: 
      yield pending.decode(charset) 

    def header_function(self, header_line): 
     """Header callback: record one ``Name: value`` line in ``self.headers``.

     Args:
      header_line: One raw header line as ``bytes``.
     """
     print('hello') 
     header_line = header_line.decode(self.HTTP_STANDARD_ENCODING) 
     # Lines without a colon are ignored.
     if ':' not in header_line: 
      return 
     name, value = header_line.split(':', 1) 
     name = name.strip() 
     value = value.strip() 
     # Names are stored lower-cased; a repeated header overwrites the earlier one.
     name = name.lower() 
     self.headers[name] = value 


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Start an HTTP request with a requests-like call signature.

    Only streaming mode is implemented. With ``stream=True`` a
    ``CurlHTTPStream`` built from the arguments is returned; otherwise the
    function returns ``None``.
    """
    if not stream:
        # Non-streaming requests are not implemented; callers get None.
        return None
    return CurlHTTPStream(method, url, data=data, params=params,
                          headers=headers)

그리고 터미널에서 테스트해 보면 다음과 같은 일이 일어납니다:

Python 3.5.1 (default, Dec 09 2015, 07:29:36) [GCC] on linux 
Type "help", "copyright", "credits" or "license" for more information. 
>>> from pycurl_requests.requests import request 
>>> r = request('GET', 'http://my-couchdb-instance:5984/user-30323561366530622d336135622d343637372d386464392d613038653536663865636566/_changes', params={'feed': 'continuous'}, stream=True) 
>>> for l in r.iter_lines(): 
...  print(l) 
... 
foo 
{} 
Assuming encoding is iso-8859-1 
hello 
hello 
hello 
hello 
hello 
hello 
hello 
{"seq":1,"id":"account","changes":[{"rev":"1-806053b347406e04d1872e13199fd3cf"}]} 
{"seq":4,"id":"identity-bd2c5007-9df3-4ece-9751-843bf5523edd","changes":[{"rev":"1-e3a98ec37776f2cb479b2dcae0266700"}]} 
{"seq":5,"id":"section_phone-0342667c-ecbd-401f-acfe-7bb2a1aa3159","changes":[{"rev":"1-457342bc895c7cb6924ceabd07e1ffcf"}]}

CouchDB changes 피드에서 더 많은 줄이 들어오지만, 관련이 없으므로 출력을 잘랐습니다.

기본적으로 출력의 foo는 헤더가 이미 준비되어 있어야 하는 블록에 진입했음을 나타내지만, 그다음 줄은 self.headers가 비어 있음을 보여 줍니다. 그리고 여러 번 출력된 hello는 header_function()이 호출될 때마다 한 번씩 찍힌 것입니다. 헤더 콜백이 트리거되기 전에, 본문을 BytesIO에 쓰는 쓰기 콜백이 어떻게 먼저 호출될 수 있습니까?

출처

2016-07-14 ElmoVanKielmo

해결책을 찾았습니다. 문제는 응답이 오기 전에 _split_lines_from_chunks(self, chunks)가 실행되었기 때문에, 그 시점에는 헤더가 아직 존재하지 않았다는 것입니다.

다음은 작동하는 코드입니다. 본문의 첫 번째 줄을 사용할 수 있게 된 시점에 charset을 감지하므로, 그때는 모든 헤더가 이미 처리된 상태입니다.

import re 
import io 
import urllib 
import urllib.error 
import http 

import pycurl 


class CurlHTTPStream(object):
    """Line-oriented streaming HTTP client built on a PycURL multi handle.

    A single ``pycurl.Curl`` easy handle is attached to a ``pycurl.CurlMulti``
    so the transfer can be advanced step by step. ``iter_lines()`` yields the
    response body as decoded text lines. The body charset is resolved lazily,
    when the first line is decoded, so the response headers have already been
    delivered to ``header_function`` by then.
    """

    # Timeout (seconds) handed to CurlMulti.select() between perform() rounds.
    SELECT_TIMEOUT = 10
    # Charset used to decode header lines, and the body when none is declared.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'

    # Extracts the charset parameter of a Content-Type value. Optional
    # double quotes and a trailing ';' or ',' are excluded from the capture.
    _CHARSET_RE = re.compile(r'charset\s*=\s*"?([^\s";,]+)', re.IGNORECASE)

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handles; no network I/O is started here.

        Args:
            method: HTTP method name, passed to curl as CUSTOMREQUEST.
            url: Target URL; stored unmodified on ``self.url``.
            data: Accepted for interface compatibility; currently unused.
            params: Optional mapping appended to the URL as a
                percent-encoded query string.
            headers: Optional mapping sent as ``Name: value`` request headers.
        """
        self.url = url
        # Body bytes written by curl accumulate here until drained.
        self.received_buffer = io.BytesIO()
        # 0 means "status not known yet"; see _check_status_code.
        self.status_code = 0
        # Response headers keyed by lower-cased name.
        self.headers = {}
        # Cached body charset; None until resolved from the headers.
        self._charset = None

        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            self.curl.setopt(
                pycurl.HTTPHEADER,
                ['{}: {}'.format(key, value) for key, value in headers.items()],
            )
        self.curl.setopt(pycurl.URL, self._build_url(url, params))
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        # Header lines go to header_function; body bytes go to the buffer.
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)

    @staticmethod
    def _build_url(url, params):
        """Return *url* with *params* appended as a percent-encoded query."""
        if not params:
            return url
        # Imported locally so this does not rely on ``urllib.parse`` having
        # been loaded as a side effect of some other import.
        from urllib.parse import urlencode
        # Extend an existing query string instead of adding a second '?'.
        separator = '&' if '?' in url else '?'
        return '{}{}{}'.format(url, separator, urlencode(params))

    def _any_data_received(self):
        """Return True if the body buffer's write position is past zero."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return all buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        self.received_buffer.truncate(0)
        # truncate() does not move the position, so rewind explicitly.
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Fetch the status code once and raise unless it is 200.

        Raises:
            urllib.error.HTTPError: if a status code is known and it is not
                ``http.HTTPStatus.OK``.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        # A code of 0 is treated as "not available yet", not as an error.
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Call perform() until it stops asking to be called again.

        Returns:
            The handle count reported by the last ``perform()`` call.
        """
        while True:
            ret, num_handles = self.curl_multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Yield raw body chunks (bytes) as the transfer progresses.

        This generator is the only place that drives the transfer: each
        iteration performs pending work, yields whatever body bytes were
        buffered, and waits in ``select`` while handles remain.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            if remaining == 0:
                break
            self.curl_multi.select(self.SELECT_TIMEOUT)

        # Final checks after the loop, covering responses that had no body.
        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first failed transfer, if any."""
        for failure in self.curl_multi.info_read()[2]:
            # failure[0] is skipped; the rest become the exception args.
            raise pycurl.error(*failure[1:])

    def iter_lines(self):
        """Return a generator of decoded response lines."""
        return self._split_lines_from_chunks(self._iter_chunks())

    def _split_lines_from_chunks(self, chunks):
        """Re-split an iterable of byte chunks into decoded text lines.

        Args:
            chunks: Iterable of ``bytes`` objects.

        Yields:
            ``str`` lines without terminators, decoded with ``self.charset``.
            The charset is read only once a line is ready to be decoded.
        """
        pending = None
        for chunk in chunks:
            if pending is not None:
                chunk = pending + chunk
            lines = chunk.splitlines()
            # A chunk that does not end on a line terminator finishes with an
            # incomplete line; hold it back and prepend it to the next chunk.
            if lines and not chunk.endswith((b'\n', b'\r')):
                pending = lines.pop()
            else:
                pending = None
            for line in lines:
                yield line.decode(self.charset)
        # Flush a final line that had no terminator.
        if pending is not None:
            yield pending.decode(self.charset)

    @property
    def charset(self):
        """Charset used to decode the response body.

        Taken from the ``charset`` parameter of the ``Content-Type`` response
        header, lower-cased. Falls back to ``HTTP_STANDARD_ENCODING`` when
        the header or the parameter is missing. The result is cached only
        after at least one response header has been recorded, so reading the
        property before the response arrives does not pin the fallback.
        """
        if self._charset is not None:
            return self._charset
        if not self.headers:
            # No headers yet: answer with the default but do not cache it.
            return self.HTTP_STANDARD_ENCODING
        match = self._CHARSET_RE.search(self.headers.get('content-type', ''))
        if match:
            self._charset = match.group(1).lower()
        else:
            self._charset = self.HTTP_STANDARD_ENCODING
        return self._charset

    def header_function(self, header_line):
        """Header callback: record one ``Name: value`` line.

        Args:
            header_line: One raw header line as ``bytes``.

        Lines without a colon are ignored. Names are stored lower-cased, and
        a repeated header overwrites the earlier value.
        """
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        if ':' not in header_line:
            return
        name, value = header_line.split(':', 1)
        self.headers[name.strip().lower()] = value.strip()


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Start an HTTP request with a requests-like call signature.

    Only streaming mode is implemented. With ``stream=True`` a
    ``CurlHTTPStream`` built from the arguments is returned; otherwise the
    function returns ``None``.
    """
    if not stream:
        # Non-streaming requests are not implemented; callers get None.
        return None
    return CurlHTTPStream(method, url, data=data, params=params,
                          headers=headers)

출처

2016-07-14 12:21:47 ElmoVanKielmo

PyCURL이 헤더보다 본문을 먼저 처리합니다.

답변

관련 문제