import sys, os, asyncio, json, re, time, pathlib, random
sys.path.insert(0,'/opt/data/scripts')
import cbm_cdp
# Path of the JSON comment log that load_ids() reads.
LOG='/opt/data/agent-state/memory/ig-encourage-agent-log.json'
# Instagram profile handles whose grids are scanned after the home feed.
ACCOUNTS=['ourdailybread','proverbs31ministries','lysaterkeurst','youversion','bibleproject','shereadstruth','desiringgod','christianitytoday']
# Phrases that indicate a login wall, checkpoint or rate-limit notice.
# state() embeds the same alternation in the JavaScript it runs in the page.
SAFE_WARN=re.compile(r'try again later|captcha|suspicious|checkpoint|challenge|temporarily blocked|restrict certain activity|automated behavior|confirm (?:it|your account)|unusual activity|action blocked|we limit|help us confirm',re.I)
# Sensitive topics that disqualify a post. Terms are anchored on word
# boundaries; stems that commonly appear inflected ("politics", "killed",
# "condolences", "donation", ...) accept a suffix so those forms are caught
# too. Terms such as "rip" and "trump" stay whole-word so that "ripple" and
# "trumpet" are not flagged.
BAD=re.compile(r'\b(deaths?|died|funerals?|suicide|self-harm|murder\w*|kill\w*|grief|grieving|rip|condolences?|politic\w*|elections?|trump|biden|lgbt\w*|transgender|abortions?|donat\w*|fundrais\w*|cashapp|onlyfans)\b',re.I)
# Promotional / call-to-action phrasing. The short words (stream, tour,
# ticket, sale) must start a word, otherwise ordinary text such as
# "Jerusalem", "detour" or "upstream" is rejected as promotional;
# "livestream" and "presale" are kept explicitly.
# NOTE(review): because of re.I, [A-Z] matches any letter, so "comment "
# followed by any word matches - confirm that this is intended.
PROMO=re.compile(r'comment\s+(?:[A-Z"“]|.*link)|link in bio|out now|\b(?:(?:live)?stream|tour|ticket|(?:pre)?sale)|pre-?order|download.*app|app!|full devotional online or in our app',re.I)
# Vocabulary whose presence (as a case-insensitive substring) marks a post
# as Christian content.
CHRIST=re.compile(r'Jesus|God|Lord|Christ|Bible|Scripture|pray|prayer|faith|grace|mercy|gospel|worship|church|Psalm|Romans|Father|Spirit|hope|peace|joy',re.I)

def load_ids():
    """Return the post IDs of comments the log records as verified.

    Reads the JSON document at ``LOG`` and collects the ``postId`` of every
    entry under its ``comments`` key whose ``verified`` and ``postId``
    values are both truthy.

    Returns:
        set: The collected post IDs. Empty when the log file is missing,
        unreadable, not valid JSON, or not shaped as described above.
    """
    try:
        # The context manager closes the handle even if parsing fails, and
        # the explicit encoding keeps the read independent of the locale.
        with open(LOG, encoding='utf-8') as fh:
            data = json.load(fh)
        return {
            entry.get('postId')
            for entry in data.get('comments', [])
            if entry.get('verified') and entry.get('postId')
        }
    except (OSError, ValueError, AttributeError, TypeError):
        # Best effort: a missing/corrupt log (OSError, ValueError) or an
        # unexpected JSON shape (AttributeError, TypeError) means "nothing
        # logged yet" rather than a crash.
        return set()
def pid(url):
    """Extract the shortcode from an Instagram post or reel URL.

    Looks for a ``/p/<code>`` or ``/reel/<code>`` path component, where the
    code consists of letters, digits, ``_`` and ``-``.

    Returns:
        The shortcode string, or ``None`` when the URL has no such component.
    """
    found = re.search(r'/(?:p|reel)/([A-Za-z0-9_-]+)', url)
    if found is None:
        return None
    return found.group(1)
async def state(b):
    """Snapshot the current page through the browser handle ``b``.

    Evaluates a JavaScript expression in the page with ``b.eval`` and returns
    whatever that call yields. The expression builds an object with:

    * ``url``      - ``location.href``
    * ``text``     - the first 5000 characters of the body's inner text
    * ``hasLogin`` - whether a username or password input is present
    * ``warning``  - whether the full body text matches one of the
      login-wall / rate-limit phrases (case-insensitive)
    """
    probe_js = """(() => ({url:location.href,text:document.body.innerText.slice(0,5000),hasLogin:!!document.querySelector('input[name="username"],input[name="password"]'),warning:/try again later|captcha|suspicious|checkpoint|challenge|temporarily blocked|restrict certain activity|automated behavior|confirm (?:it|your account)|unusual activity|action blocked|we limit|help us confirm/i.test(document.body.innerText)}))()"""
    snapshot = await b.eval(probe_js)
    return snapshot
async def collect_links_from_profile(b,acct):
    """Gather post and reel URLs from the profile page of ``acct``.

    Opens ``https://www.instagram.com/<acct>/`` through ``b.goto`` and then
    makes three passes. Each pass reads up to 12 distinct anchor hrefs
    containing ``/p/`` or ``/reel/`` (query strings stripped), records the
    ones not seen before, scrolls the window down by 900 pixels and sleeps
    for 1.5 seconds.

    Returns:
        list: At most the first 10 URLs, in the order they were first seen.
    """
    await b.goto(f'https://www.instagram.com/{acct}/', wait=5)
    harvest_js = """(() => [...new Set([...document.querySelectorAll('a[href*="/p/"],a[href*="/reel/"]')].map(a=>a.href.split('?')[0]))].slice(0,12))()"""
    found = []
    for _ in range(3):
        batch = await b.eval(harvest_js)
        if batch:
            for href in batch:
                if href in found:
                    continue
                found.append(href)
        # Scroll so the next pass can see further grid rows.
        await b.eval('window.scrollBy(0,900)')
        await asyncio.sleep(1.5)
    return found[:10]
async def extract_post(b,u,acct,seen_ids):
    """Decide whether the post at ``u`` is a usable candidate.

    Args:
        b: Browser handle providing awaitable ``goto`` (used here) and
            ``eval`` (used by ``state``).
        u: URL of the post or reel.
        acct: Account name to record in the result.
        seen_ids: Container of post IDs that must be skipped.

    Returns:
        dict: Always contains ``url`` and ``account``. An accepted post also
        has ``postId`` and ``text`` (first 1800 characters of the page text).
        A rejected post has ``reject`` with the reason, plus ``postId``,
        ``excerpt`` (first 900 characters) or ``state`` depending on which
        check rejected it.
    """
    post = pid(u)
    # Checked before navigating: the outcome does not depend on the page,
    # so posts that are already logged (or whose URL carries no id) cost no
    # page load. The reason string is shared by both cases.
    if not post or post in seen_ids:
        return {'url':u,'account':acct,'postId':post,'reject':'already logged'}

    await b.goto(u, wait=5)
    st = await state(b)
    if st['hasLogin'] or st['warning']:
        return {'url':u,'account':acct,'reject':'login/warning','state':st}

    text = st['text'] or ''
    # A visible comment under this name means the post was already handled.
    if re.search(r'vincent\.emmanuel\.lee|Vincent Emmanuel Lee', text):
        return {'url':u,'account':acct,'postId':post,'reject':'visible Vincent comment'}

    # Content filters, applied in order; the first match decides the reason.
    if BAD.search(text):
        return {'url':u,'account':acct,'postId':post,'reject':'sensitive/unsafe keywords','excerpt':text[:900]}
    if PROMO.search(text):
        return {'url':u,'account':acct,'postId':post,'reject':'promotional/app/link prompt','excerpt':text[:900]}
    if not CHRIST.search(text):
        return {'url':u,'account':acct,'postId':post,'reject':'not clearly Christian','excerpt':text[:900]}

    # No caption isolation is attempted: the leading page text is returned as is.
    return {'url':u,'account':acct,'postId':post,'text':text[:1800]}
def _account_from_url(url):
    """Return the account segment of an Instagram URL, or '' if there is none.

    Only the first path segment is examined. The route prefixes ``p`` and
    ``reel`` (as in ``/p/<id>/``) are not account names and yield ''.
    """
    found = re.search(r'instagram\.com/([^/]+)/', url)
    if found is None:
        return ''
    segment = found.group(1)
    return '' if segment in ('p', 'reel') else segment


async def main():
    """Collect candidate posts and print them as a JSON report.

    Steps:
      1. Launch and connect to the browser profile; if the current page shows
         a login form or a warning, print a ``fatal`` report and stop.
      2. Gather post/reel links from the home feed (four scroll passes) and
         then from each profile in ``ACCOUNTS``, stopping the profile scan
         once more than 70 links are known.
      3. Run ``extract_post`` on each link until 16 candidates are accepted.
      4. Print ``candidates``, up to 80 ``skipped`` entries and ``totalLinks``.

    The browser handle is closed on every exit path.
    """
    cbm_cdp.launch_profile()
    b = await cbm_cdp.connect(prefer_instagram=True)
    try:
        st = await state(b)
        if st['hasLogin'] or st['warning']:
            print(json.dumps({'fatal':'login_or_warning','state':st},ensure_ascii=False))
            return

        seen = load_ids()
        out = []
        skipped = []
        # url -> account hint. A dict keeps discovery order and makes the
        # duplicate check O(1); the hint is the profile a link was found on.
        link_accounts = {}

        # Home feed first.
        await b.goto('https://www.instagram.com/', wait=5)
        for _ in range(4):
            links = await b.eval("""(() => [...new Set([...document.querySelectorAll('article a[href*="/p/"],article a[href*="/reel/"]')].map(a=>a.href.split('?')[0]))])()""")
            for u in links or []:
                link_accounts.setdefault(u, '')
            await b.eval('window.scrollBy(0,1000)')
            await asyncio.sleep(2)

        for acct in ACCOUNTS:
            try:
                for u in await collect_links_from_profile(b, acct):
                    # Assigning to an existing key keeps its position, so a
                    # feed link only gains the hint it was missing.
                    if not link_accounts.get(u):
                        link_accounts[u] = acct
            except Exception as e:
                # One failing profile must not abort the whole run.
                skipped.append({'account':acct,'reject':'profile collect error','error':str(e)[:200]})
            if len(link_accounts) > 70:
                break

        for u, hint in link_accounts.items():
            # Prefer an account named in the URL itself; "/p/<id>/" style
            # links carry none, so fall back to the profile hint (best effort).
            acct = _account_from_url(u) or hint
            try:
                item = await extract_post(b, u, acct, seen)
            except Exception as e:
                item = {'url':u,'account':acct,'reject':'extract error','error':str(e)[:200]}
            if 'text' in item:
                out.append(item)
            else:
                skipped.append(item)
            if len(out) >= 16:
                break

        print(json.dumps({'candidates':out,'skipped':skipped[:80],'totalLinks':len(link_accounts)},ensure_ascii=False,indent=2))
    finally:
        await b.close()
# Entry point: there is no __main__ guard, so main() runs on a new event loop
# whenever this module is executed or imported.
asyncio.run(main())
