How to store data

Data storage tutorial
Some indicators need to refer to earlier data when they run their processing logic, such as the time of the last run or the result of the last data processing. In these cases, the intermediate data or the result data must be stored for subsequent use.
DB-level data storage is not supported at the moment; only Redis-level data storage is provided.

Set up a Redis connection

When testing and developing locally, set REDIS_URL = env.str('REDIS_URL', 'redis://127.0.0.1:6379/0') in the crawlers/config.py file so that it points to your local Redis URL.

Redis usage

Some commonly used Redis operations are defined in the utils/redis_conn.py file of the project library.
At present, only the methods shown in the following code are provided and supported for external use:
import logging
import redis
import time
from crawlers.config import REDIS_URL
from urllib.parse import urlparse
# Parse the configured URL once; the individual parts feed the connection pool.
_REDIS_URL = urlparse(REDIS_URL)

# One shared pool for the whole process; every helper below reuses it.
pool = redis.connection.ConnectionPool(
    host=_REDIS_URL.hostname,
    # Fall back to the standard Redis port when the URL does not name one.
    port=_REDIS_URL.port or 6379,
    # The URL path is "/<db>"; an empty path selects database 0.
    db=int(_REDIS_URL.path.lstrip('/') or 0),
    # Forward the password from "redis://:password@host:port/db" URLs
    # (None when the URL carries no credentials).
    password=_REDIS_URL.password,
)
_redis_client = redis.Redis(connection_pool=pool)
class rds:
    """Namespaced string storage and locking on the shared Redis client.

    Every stored key is built as ``prefix + ':' + name``. Keys are limited
    to 1 KB and values to 128 KB, both measured on the UTF-8 encoded string.
    """

    # Size limits in bytes of the UTF-8 encoded string.
    _MAX_KEY_BYTES = 1024
    _MAX_VALUE_BYTES = 1024 * 128

    @classmethod
    def getex(cls, prefix, name):
        """Return the value stored at ``prefix + ':' + name``.

        prefix: The name value of the current crawler.
        name: A custom name related to the current business.

        Returns the value decoded as UTF-8 text, or None when the key does
        not exist or the key is larger than 1 KB.
        """
        key = prefix + ':' + name
        if len(key.encode()) > cls._MAX_KEY_BYTES:
            logging.warning('Key is too large')
            return None
        value = _redis_client.get(key)
        if value is None:
            return None
        # Decode every stored value, including an empty one, so callers
        # always receive ``str`` rather than a mix of ``str`` and ``bytes``.
        return value.decode('utf-8')

    @classmethod
    def setex(cls, prefix, name: str, value: str, ttl):
        """Store ``value`` at ``prefix + ':' + name`` if the key is not set yet.

        prefix: The name value of the current crawler.
        name: A custom name related to the current business. The key string
            size cannot exceed 1 KB.
        value: The stored value; cannot exceed 128 KB.
        ttl: Expiration time in seconds, chosen according to business
            requirements. A falsy ttl stores the key without expiration.

        Returns True when the value was written, False when the key already
        exists or a size limit is exceeded.
        """
        key = prefix + ':' + name
        # Size limit, value cannot exceed 128 KB, key cannot exceed 1 KB
        if (len(value.encode()) > cls._MAX_VALUE_BYTES
                or len(key.encode()) > cls._MAX_KEY_BYTES):
            logging.warning('Key or Value is too large')
            return False
        # A single SET with NX (and EX when a ttl is given) is atomic, so two
        # concurrent callers cannot both pass an "is it there?" check.
        return bool(_redis_client.set(key, value, ex=ttl or None, nx=True))

    @classmethod
    def get_and_set_key(cls, prefix, name: str, value: str, ttl: int = None):
        """Report whether ``prefix + ':' + name`` exists, storing ``value`` if not.

        prefix: The name value of the current crawler.
        name: A custom name related to the current business. The key string
            size cannot exceed 1 KB.
        value: The stored value; cannot exceed 128 KB.
        ttl: Optional expiration time in seconds; a falsy ttl stores the key
            without expiration.

        Returns True when the key already exists (nothing is written), None
        when the value was newly stored, and False when a size limit is
        exceeded.
        """
        key = prefix + ':' + name
        # Size limit, value cannot exceed 128 KB, key cannot exceed 1 KB
        if (len(value.encode()) > cls._MAX_VALUE_BYTES
                or len(key.encode()) > cls._MAX_KEY_BYTES):
            logging.warning('Key or Value is too large')
            return False
        # Operate on the prefixed key (not the bare name) and do the
        # existence check, write and expiry in one atomic command.
        created = _redis_client.set(key, value, ex=ttl or None, nx=True)
        if not created:
            return True
        return None

    @classmethod
    def thing_lock(cls, name, expiration_time=2, time_out=3):
        """Decorator factory implementing a pessimistic lock in Redis.

        Function: Avoid simultaneous execution of functions, resulting in
        unpredictable problems.

        name: Lock identifier; the Redis key is ``lock:<name>``.
        expiration_time: Lifetime of the lock key in seconds (a positive
            integer), so a lock is not held forever.
        time_out: Seconds to keep retrying to acquire the lock.

        If the lock cannot be acquired within ``time_out`` seconds, the
        wrapped function is still executed, without holding the lock.
        """
        # Local import so the helper does not depend on file-level imports.
        from functools import wraps

        def outer_func(func):
            @wraps(func)
            def wrapper_func(*args, **kwargs):
                lock_name = f'lock:{name}'
                end_time = time.time() + time_out
                while time.time() < end_time:
                    # Create the lock and set its expiry in one command so a
                    # crash in between cannot leave a lock without a ttl.
                    if _redis_client.set(lock_name, expiration_time,
                                         nx=True, ex=expiration_time):
                        try:
                            return func(*args, **kwargs)
                        finally:
                            # Release the lock even when func raises.
                            _redis_client.delete(lock_name)
                    time.sleep(0.001)
                # Timed out waiting for the lock: run unprotected.
                return func(*args, **kwargs)
            return wrapper_func
        return outer_func

Todo:

The prefix argument is the prefix of the Redis key; pass the spider's name value for it.

Demo

Set value:
import json
import scrapy
import time
import datetime
from crawlers.utils import SpiderBase, rds
from crawlers.utils.group_alarm import catch_except
from jinja2 import Template
class BtcArh999Spider(SpiderBase):
    """Demo spider: fetch the BTC ahr999 index and print an alert once per day.

    The rendered parameters are stored in Redis under the spider name and
    today's timestamp; when that entry already exists no alert is printed.
    """

    name = 'idx-btc-arh999'
    url = 'https://fapi.coinglass.com/api/index/ahr999'

    def start_requests(self):
        """Yield the single request for the ahr999 endpoint."""
        yield scrapy.Request(url=self.url)

    @catch_except
    def parse(self, response, **kwargs):
        """Build the alert parameters and print both templates if not yet stored today."""
        data = response.json()['data']
        # The last two entries are compared; presumably the list is ordered
        # oldest to newest -- confirm against the API.
        latest, previous = data[-1], data[-2]
        latest_price = float(latest['value'])
        previous_price = float(previous['value'])
        params = {
            'arh_999': round(latest['ahr999'], 2),
            'btc_price': latest['value'],
            # Percentage change between the two most recent prices.
            'change': round(((latest_price - previous_price) / previous_price) * 100, 2),
        }
        # Local midnight of today as a Unix timestamp, plus one second.
        today_start_time = str(int(time.mktime(time.strptime(str(datetime.date.today()), '%Y-%m-%d'))) + 1)
        # An existing entry for today means this spider already ran; skip.
        value = rds.getex(self.name, today_start_time)
        if value is not None:
            return
        # Keep the entry for two days.
        rds.setex(self.name, today_start_time, json.dumps(params), 60 * 60 * 24 * 2)
        print(Template(self.alert_en_template()).render(params))
        print(Template(self.alert_cn_template()).render(params))

    # Must be declared.
    def alert_en_template(self):
        """Return the English alert template (Jinja2 syntax)."""
        # The closing quotes stay at column 0 so the literal ends with a bare newline.
        return """The current BTC ahr999 (AHR Index) is {{arh_999}}. This spot is theoretically unsuitable for bottom fishing or long-term fixed investment. The current price of BTC is {{btc_price}}, and 24H change is {{change}}. (The above content does not constitute investment advice and is for your reference only. Invest at your own risk.)
"""

    # Must be declared.
    def alert_cn_template(self):
        """Return the Chinese alert template (Jinja2 syntax)."""
        # The closing quotes stay at column 0 so the literal ends with a bare newline.
        return """当前 BTC ahr999(九神指数)为 {{arh_999}},理论上不宜买入抄底或定投 BTC。当前 BTC 现价 {{btc_price}},24小时涨跌幅为 {{change}}。(以上内容仅供参考,非投资建议,风险自担。)
"""
Get value:
import json
import scrapy
import time
import datetime
from crawlers.utils import SpiderBase, rds
from crawlers.utils.group_alarm import catch_except
from jinja2 import Template
class BtcArh999Spider(SpiderBase):
    """Demo spider: read today's stored entry from Redis before alerting.

    The rendered parameters are stored in Redis under the spider name and
    today's timestamp; when that entry already exists no alert is printed.
    """

    name = 'idx-btc-arh999'
    url = 'https://fapi.coinglass.com/api/index/ahr999'

    def start_requests(self):
        """Yield the single request for the ahr999 endpoint."""
        yield scrapy.Request(url=self.url)

    @catch_except
    def parse(self, response, **kwargs):
        """Build the alert parameters and print both templates if not yet stored today."""
        data = response.json()['data']
        # The last two entries are compared; presumably the list is ordered
        # oldest to newest -- confirm against the API.
        latest, previous = data[-1], data[-2]
        latest_price = float(latest['value'])
        previous_price = float(previous['value'])
        params = {
            'arh_999': round(latest['ahr999'], 2),
            'btc_price': latest['value'],
            # Percentage change between the two most recent prices.
            'change': round(((latest_price - previous_price) / previous_price) * 100, 2),
        }
        # Local midnight of today as a Unix timestamp, plus one second.
        today_start_time = str(int(time.mktime(time.strptime(str(datetime.date.today()), '%Y-%m-%d'))) + 1)
        # redis get method
        value = rds.getex(self.name, today_start_time)
        if value is not None:
            return
        # redis set method (the entry is kept for two days)
        rds.setex(self.name, today_start_time, json.dumps(params), 60 * 60 * 24 * 2)
        print(Template(self.alert_en_template()).render(params))
        print(Template(self.alert_cn_template()).render(params))

    # Must be declared.
    def alert_en_template(self):
        """Return the English alert template (Jinja2 syntax)."""
        # The closing quotes stay at column 0 so the literal ends with a bare newline.
        return """The current BTC ahr999 (AHR Index) is {{arh_999}}. This spot is theoretically unsuitable for bottom fishing or long-term fixed investment. The current price of BTC is {{btc_price}}, and 24H change is {{change}}. (The above content does not constitute investment advice and is for your reference only. Invest at your own risk.)
"""

    # Must be declared.
    def alert_cn_template(self):
        """Return the Chinese alert template (Jinja2 syntax)."""
        # The closing quotes stay at column 0 so the literal ends with a bare newline.
        return """当前 BTC ahr999(九神指数)为 {{arh_999}},理论上不宜买入抄底或定投 BTC。当前 BTC 现价 {{btc_price}},24小时涨跌幅为 {{change}}。(以上内容仅供参考,非投资建议,风险自担。)
"""