Skip to content

Commit

Permalink
Update for code
Browse files Browse the repository at this point in the history
  • Loading branch information
阳渠 committed Jan 31, 2024
1 parent 244a247 commit b0bbed5
Show file tree
Hide file tree
Showing 81 changed files with 9,718 additions and 6 deletions.
Binary file added MobileAgent/__pycache__/api.cpython-310.pyc
Binary file not shown.
Binary file added MobileAgent/__pycache__/chat.cpython-310.pyc
Binary file not shown.
Binary file not shown.
Binary file added MobileAgent/__pycache__/crop.cpython-310.pyc
Binary file not shown.
Binary file not shown.
Binary file added MobileAgent/__pycache__/prompt.cpython-310.pyc
Binary file not shown.
Binary file not shown.
35 changes: 35 additions & 0 deletions MobileAgent/api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import base64
import requests

def encode_image(image_path):
    """Read a file from disk and return its contents as base64 text.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


def inference_chat(chat, API_TOKEN):
    """Send a chat history to the OpenAI chat-completions API and return the reply.

    Args:
        chat: Iterable of ``(role, content)`` pairs; each pair becomes one
            entry of the request's ``messages`` list.
        API_TOKEN: Token placed in the ``Authorization: Bearer`` header.

    Returns:
        The ``content`` field of the first choice's message.

    The request is retried without limit until a response with the expected
    shape is received. Every failed attempt is reported on stdout.
    """
    api_url = 'https://api.openai.com/v1/chat/completions'
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_TOKEN}"
    }

    data = {
        "model": 'gpt-4-vision-preview',
        "messages": [{"role": role, "content": content} for role, content in chat],
        "max_tokens": 2048,
    }

    while True:
        # Reset per attempt so the handler never prints a stale payload.
        payload = None
        try:
            response = requests.post(api_url, headers=headers, json=data)
            payload = response.json()
            return payload['choices'][0]['message']['content']
        # `Exception` (not a bare except) keeps the best-effort retry while
        # still letting KeyboardInterrupt / SystemExit stop the loop.
        except Exception as err:
            print("Network Error:")
            # Show the decoded body when there is one (e.g. an API error
            # object); otherwise show the exception. The request may have
            # failed before any response object existed.
            print(payload if payload is not None else err)
68 changes: 68 additions & 0 deletions MobileAgent/chat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import copy
from MobileAgent.api import encode_image


def init_chat(instruction):
    """Build the two-turn opening history for a phone-operation session.

    Args:
        instruction: The user's task; interpolated into the first message.

    Returns:
        A list of two ``[role, content]`` entries: a "user" turn carrying the
        assistant persona plus the instruction, and a canned "assistant"
        acknowledgement.
    """
    opening_text = (
        "You are a helpful phone operating assistant. You need to help me operate the phone to complete my instruction.\n"
        f"My instruction is: {instruction}"
    )
    user_turn = ["user", [{"type": "text", "text": opening_text}]]
    assistant_turn = ["assistant", [{"type": "text", "text": "Sure. How can I help you?"}]]
    return [user_turn, assistant_turn]


def add_response(role, prompt, chat_history, image=None):
    """Return a deep copy of ``chat_history`` with one extra turn appended.

    Args:
        role: Role label stored as the first element of the new turn.
        prompt: Text placed in the turn's "text" part.
        chat_history: Existing list of ``[role, content]`` turns; not mutated.
        image: Optional image path. When truthy, the file is base64-encoded
            with ``encode_image`` and attached as a JPEG data URL.

    Returns:
        The new history list.
    """
    updated_history = copy.deepcopy(chat_history)

    content = [{"type": "text", "text": prompt}]
    if image:
        encoded = encode_image(image)
        content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
        })

    updated_history.append([role, content])
    return updated_history


def add_multiimage_response(role, prompt, chat_history, images):
    """Return a deep copy of ``chat_history`` plus one turn with several images.

    Args:
        role: Role label stored as the first element of the new turn.
        prompt: Text placed in the turn's leading "text" part.
        chat_history: Existing list of ``[role, content]`` turns; not mutated.
        images: Iterable of image paths; each is base64-encoded with
            ``encode_image`` and attached, in order, as a JPEG data URL.

    Returns:
        The new history list.
    """
    updated_history = copy.deepcopy(chat_history)

    image_parts = [
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{encode_image(path)}"},
        }
        for path in images
    ]
    text_part = {"type": "text", "text": prompt}

    updated_history.append([role, [text_part, *image_parts]])
    return updated_history


def print_status(chat_history):
    """Print a compact view of the chat history to stdout.

    Each turn is shown as its role followed by the text of its first content
    part, with one ``<image>`` marker per additional content part. The dump
    is framed by two lines of 100 asterisks.

    Args:
        chat_history: List of ``[role, content]`` turns, where ``content`` is
            a list whose first item has a "text" key.
    """
    divider = "*" * 100
    print(divider)
    for turn in chat_history:
        role, parts = turn[0], turn[1]
        extra_parts = len(parts) - 1
        print("role:", role)
        print(parts[0]["text"] + "<image>" * extra_parts)
    print(divider)
84 changes: 84 additions & 0 deletions MobileAgent/controller.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import time
import subprocess
from PIL import Image


def get_size(adb_path):
    """Query the device's screen resolution through ``adb shell wm size``.

    Args:
        adb_path: Command prefix used to invoke adb; the string
            " shell wm size" is appended and the result is run via the shell.

    Returns:
        ``(width, height)`` as ints, parsed from the last line of the command
        output: the text after the final space, split on "x".
    """
    completed = subprocess.run(
        adb_path + " shell wm size",
        shell=True,
        text=True,
        capture_output=True,
    )
    # Only the final output line is used when several lines are printed.
    last_line = completed.stdout.strip().split('\n')[-1]
    dimensions = last_line.split(' ')[-1]
    width, height = (int(part) for part in dimensions.split('x'))
    return width, height


def get_screenshot(adb_path):
    """Capture the device screen and save a half-size JPEG copy locally.

    The previous ``/sdcard/screenshot.png`` on the device is removed, a new
    one is captured with ``screencap`` and pulled into ``./screenshot``.
    The PNG is then downscaled to 50% in each dimension and written to
    ``./screenshot/screenshot.jpg``.

    Args:
        adb_path: Command prefix used to invoke adb; commands are run via
            the shell.
    """
    import os

    # `adb pull` into a missing directory would create a plain file named
    # "screenshot", and the Image.open below would then fail.
    os.makedirs("./screenshot", exist_ok=True)

    command = adb_path + " shell rm /sdcard/screenshot.png"
    subprocess.run(command, capture_output=True, text=True, shell=True)
    time.sleep(0.5)
    command = adb_path + " shell screencap -p /sdcard/screenshot.png"
    subprocess.run(command, capture_output=True, text=True, shell=True)
    time.sleep(0.5)
    command = adb_path + " pull /sdcard/screenshot.png ./screenshot"
    subprocess.run(command, capture_output=True, text=True, shell=True)

    image_path = "./screenshot/screenshot.png"
    save_path = "./screenshot/screenshot.jpg"
    # The context manager releases the PNG file handle as soon as the
    # resized copy has been written.
    with Image.open(image_path) as image:
        original_width, original_height = image.size
        new_size = (int(original_width * 0.5), int(original_height * 0.5))
        resized_image = image.resize(new_size)
        # JPEG has no alpha channel, so convert to RGB before saving.
        resized_image.convert("RGB").save(save_path, "JPEG")
    time.sleep(1)


def tap(adb_path, x, y, px, py):
    """Tap the screen at a position given as a fraction of a reference size.

    Args:
        adb_path: Command prefix used to invoke adb.
        x: Horizontal factor; multiplied by ``px`` and truncated to an int.
        y: Vertical factor; multiplied by ``py`` and truncated to an int.
        px: Horizontal scale the ``x`` factor is applied to.
        py: Vertical scale the ``y`` factor is applied to.

    The function sleeps for one second after issuing the tap.
    """
    abs_x = int(x * px)
    abs_y = int(y * py)
    tap_command = adb_path + " shell input tap {} {}".format(abs_x, abs_y)
    subprocess.run(tap_command, shell=True, text=True, capture_output=True)
    time.sleep(1)


def type(adb_path, text):
    """Type ``text`` on the device, one character per adb command.

    Character handling:
      * a space is sent as ``input text %s`` (the ``input text`` escape
        for a space);
      * a newline - real, or the two-character sequence backslash + "n" -
        is sent as key event 66 (Enter);
      * ASCII letters and digits are sent with ``input text``;
      * a fixed set of punctuation, including the underscore, is sent with
        ``input text`` wrapped in double quotes;
      * anything else is delivered through an ``ADB_INPUT_TEXT`` broadcast
        (presumably handled by an ADB keyboard IME on the device - verify
        the device setup).

    Args:
        adb_path: Command prefix used to invoke adb; commands are run via
            the shell.
        text: The text to type.

    The function sleeps for one second after the last character.
    """
    # Normalise the escaped form to a real newline and dispatch on "\n".
    # Using "_" as the newline marker would turn every literal underscore
    # in the text into an Enter key press.
    text = text.replace("\\n", "\n")
    for char in text:
        if char == ' ':
            command = adb_path + " shell input text %s"
        elif char == '\n':
            command = adb_path + " shell input keyevent 66"
        elif 'a' <= char <= 'z' or 'A' <= char <= 'Z' or char.isdigit():
            command = adb_path + f" shell input text {char}"
        elif char in '-.,!?@\'°/:;()_':
            command = adb_path + f" shell input text \"{char}\""
        else:
            command = adb_path + f" shell am broadcast -a ADB_INPUT_TEXT --es msg \"{char}\""
        subprocess.run(command, capture_output=True, text=True, shell=True)
    time.sleep(1)


def slide(adb_path, action, x, y):
    """Swipe vertically from the centre of an ``x`` by ``y`` area.

    Args:
        adb_path: Command prefix used to invoke adb.
        action: Text describing the direction. If it contains "down" the
            swipe ends at ``y/4``; otherwise, if it contains "up", it ends
            at ``3*y/4``. With neither word no swipe is issued.
        x: Width used to compute the horizontal centre.
        y: Height used to compute the start and end rows.

    The swipe lasts 500 ms; the function always sleeps one second at the end.
    """
    if "down" in action:
        end_y = int(y / 4)
    elif "up" in action:
        end_y = int(3 * y / 4)
    else:
        end_y = None

    if end_y is not None:
        mid_x, mid_y = int(x / 2), int(y / 2)
        swipe_command = adb_path + f" shell input swipe {mid_x} {mid_y} {mid_x} {end_y} 500"
        subprocess.run(swipe_command, shell=True, text=True, capture_output=True)
    time.sleep(1)


def back(adb_path):
    """Send key event 4 to the device, then wait one second.

    Args:
        adb_path: Command prefix used to invoke adb.
    """
    back_command = adb_path + " shell input keyevent 4"
    subprocess.run(back_command, shell=True, text=True, capture_output=True)
    time.sleep(1)


def back_to_desktop(adb_path):
    """Start the MAIN/HOME intent on the device, then wait one second.

    Args:
        adb_path: Command prefix used to invoke adb.
    """
    home_command = (
        adb_path
        + " shell am start -a android.intent.action.MAIN"
        + " -c android.intent.category.HOME"
    )
    subprocess.run(home_command, shell=True, text=True, capture_output=True)
    time.sleep(1)
141 changes: 141 additions & 0 deletions MobileAgent/crop.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
import math
import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import clip
import torch


def crop_image(img, position):
    """Warp a four-cornered region of ``img`` into an upright rectangle.

    Args:
        img: Image passed unchanged to ``cv2.warpPerspective``.
        position: Object with ``.tolist()`` yielding four ``[x, y]`` corner
            points, in any order.

    Returns:
        The result of ``cv2.warpPerspective``. Its width is the distance
        between the midpoints of the left and right edges, and its height
        is the distance between the midpoints of the top and bottom edges.
    """
    points = position.tolist()

    # Exchange sort by x. Kept as explicit swaps so that points with equal x
    # end up in exactly the order this pairwise scheme produces.
    for i in range(4):
        for j in range(i + 1, 4):
            if points[i][0] > points[j][0]:
                points[i], points[j] = points[j], points[i]

    # Within the two leftmost and the two rightmost points, put the one with
    # the smaller y first.
    if points[0][1] > points[1][1]:
        points[0], points[1] = points[1], points[0]
    if points[2][1] > points[3][1]:
        points[2], points[3] = points[3], points[2]

    tl_x, tl_y = points[0][0], points[0][1]  # left pair, smaller y
    bl_x, bl_y = points[1][0], points[1][1]  # left pair, larger y
    tr_x, tr_y = points[2][0], points[2][1]  # right pair, smaller y
    br_x, br_y = points[3][0], points[3][1]  # right pair, larger y

    def gap(ax, ay, bx, by):
        return math.sqrt((ax - bx) ** 2 + (ay - by) ** 2)

    source_corners = np.array(
        [[tl_x, tl_y], [tr_x, tr_y], [bl_x, bl_y], [br_x, br_y]],
        dtype=np.float32,
    )

    img_width = gap(
        (tl_x + bl_x) / 2, (tl_y + bl_y) / 2,
        (tr_x + br_x) / 2, (tr_y + br_y) / 2,
    )
    img_height = gap(
        (tl_x + tr_x) / 2, (tl_y + tr_y) / 2,
        (bl_x + br_x) / 2, (bl_y + br_y) / 2,
    )

    target_corners = np.array(
        [
            [0, 0],
            [img_width - 1, 0],
            [0, img_height - 1],
            [img_width - 1, img_height - 1],
        ],
        dtype=np.float32,
    )

    transform = cv2.getPerspectiveTransform(source_corners, target_corners)
    return cv2.warpPerspective(img, transform, (int(img_width), int(img_height)))


def calculate_size(box):
    """Return the area of a box given as ``[x0, y0, x1, y1]``.

    Args:
        box: Indexable with at least four numeric items.

    Returns:
        ``(box[2] - box[0]) * (box[3] - box[1])``.
    """
    width = box[2] - box[0]
    height = box[3] - box[1]
    return width * height


def calculate_iou(box1, box2):
    """Compute the intersection-over-union of two ``[x0, y0, x1, y1]`` boxes.

    Args:
        box1: First box, indexable with four numeric items.
        box2: Second box, same layout.

    Returns:
        Intersection area divided by union area. Returns ``0.0`` when the
        union area is zero (both boxes degenerate) instead of dividing by
        zero.
    """
    xA = max(box1[0], box2[0])
    yA = max(box1[1], box2[1])
    xB = min(box1[2], box2[2])
    yB = min(box1[3], box2[3])

    # Clamp at 0 so disjoint boxes yield an empty intersection rather than
    # a positive product of two negative extents.
    interArea = max(0, xB - xA) * max(0, yB - yA)
    box1Area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    box2Area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    unionArea = box1Area + box2Area - interArea

    # Two zero-area boxes have nothing to overlap.
    if unionArea == 0:
        return 0.0

    return interArea / unionArea


def crop(image, box, i, text_data=None):
    """Crop a region out of an image file and save it as ``./temp/{i}.jpg``.

    Args:
        image: Path (or file object) accepted by ``PIL.Image.open``.
        box: Crop rectangle passed to ``Image.crop``.
        i: Identifier used for the output file name. When ``text_data`` is
            given it is also drawn onto the image as a label.
        text_data: Optional ``[x0, y0, x1, y1]`` rectangle. When truthy, a
            red outline is drawn around it and ``str(i)`` is written in red
            near its top-left corner before cropping.
    """
    # The context manager closes the source file once the crop is saved.
    with Image.open(image) as source:
        if text_data:
            draw = ImageDraw.Draw(source)
            draw.rectangle(((text_data[0], text_data[1]), (text_data[2], text_data[3])), outline="red", width=5)
            # Clamp to 1 so very short rectangles never request a zero-size font.
            font_size = max(1, int((text_data[3] - text_data[1]) * 0.75))
            try:
                font = ImageFont.truetype("arial.ttf", font_size)
            except OSError:
                # arial.ttf is missing on many non-Windows systems; fall
                # back to PIL's built-in font instead of crashing.
                font = ImageFont.load_default()
            draw.text((text_data[0] + 5, text_data[1] + 5), str(i), font=font, fill="red")

        cropped_image = source.crop(box)
        cropped_image.save(f"./temp/{i}.jpg")


def in_box(box, target):
    """Tell whether ``box`` lies strictly inside ``target``.

    Both arguments are ``[x0, y0, x1, y1]``. Touching edges do not count as
    inside, because every comparison is strict.

    Returns:
        ``True`` if all four edges of ``box`` are strictly within ``target``,
        else ``False``.
    """
    min_corner_inside = box[0] > target[0] and box[1] > target[1]
    if not min_corner_inside:
        return False
    return bool(box[2] < target[2] and box[3] < target[3])


def crop_for_clip(image, box, i, position):
    """Save a crop of ``box`` only if it lies inside the named screen region.

    Args:
        image: Path (or file object) accepted by ``PIL.Image.open``.
        box: ``[x0, y0, x1, y1]`` crop rectangle.
        i: Identifier used for the output file ``./temp/{i}.jpg``.
        position: Region name - "left", "right", "top", "bottom", "top left",
            "top right", "bottom left" or "bottom right". Any other value
            selects the whole image.

    Returns:
        ``True`` if ``box`` is strictly inside the region (crop saved),
        ``False`` otherwise (nothing written).
    """
    picture = Image.open(image)
    w, h = picture.size
    half_w, half_h = w / 2, h / 2

    regions = (
        ("left", [0, 0, half_w, h]),
        ("right", [half_w, 0, w, h]),
        ("top", [0, 0, w, half_h]),
        ("bottom", [0, half_h, w, h]),
        ("top left", [0, 0, half_w, half_h]),
        ("top right", [half_w, 0, w, half_h]),
        ("bottom left", [0, half_h, half_w, h]),
        ("bottom right", [half_w, half_h, w, h]),
    )
    for label, region in regions:
        if position == label:
            bound = region
            break
    else:
        # Unrecognised position: no restriction beyond the image itself.
        bound = [0, 0, w, h]

    if not in_box(box, bound):
        return False

    picture.crop(box).save(f"./temp/{i}.jpg")
    return True


def clip_for_icon(clip_model, clip_preprocess, images, prompt):
    """Pick the image that best matches a text prompt using a CLIP model.

    Args:
        clip_model: Model exposing ``encode_image`` and ``encode_text``.
        clip_preprocess: Callable turning a PIL image into a tensor
            (``unsqueeze(0)`` is applied to add a leading dimension).
        images: Sequence of image paths to score.
        prompt: Text description to match against.

    Returns:
        Index into ``images`` of the highest-scoring image.

    NOTE(review): inputs are not moved to any device here; this assumes the
    model lives on the same device as the tensors produced by
    ``clip_preprocess`` and ``clip.tokenize`` - confirm against the caller.
    """
    # Pure inference: only an index is returned, so skip autograd
    # bookkeeping for every forward pass.
    with torch.no_grad():
        image_features = []
        for image_file in images:
            # Close each file as soon as it has been preprocessed.
            with Image.open(image_file) as opened:
                image = clip_preprocess(opened).unsqueeze(0)
            image_features.append(clip_model.encode_image(image))
        image_features = torch.cat(image_features)

        text = clip.tokenize([prompt])
        text_features = clip_model.encode_text(text)

        # Normalise both sides so the matrix product is a cosine similarity.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        # The softmax runs across images (dim=0); the product holds one
        # score per image for the single prompt.
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=0).squeeze(0)
        _, max_pos = torch.max(similarity, dim=0)
        pos = max_pos.item()

    return pos
Loading

0 comments on commit b0bbed5

Please sign in to comment.